def plot_common(title=None):
    plt.legend(loc='best')
    if args.xlim is not None:
        plt.xlim(args.xlim)
    if args.ylim is not None:
        plt.ylim(args.ylim)
    if args.title is not None:
        title = args.title
    if title:
        plt.title(title)
    plt.margins(0)  # autoscale(tight=True)
    utils.make_dir_if_not_exists(args.plots_dir)
    img_path = os.path.join(args.plots_dir, args.file_name)
    ut.mpl.save_show_fig(args, plt, img_path)
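# Hypothetical usage sketch (not from this module): `plot_common` assumes a
# module-level `args` namespace carrying `xlim`, `ylim`, `title`, `plots_dir`,
# `file_name`, plus whatever `ut.mpl.save_show_fig` expects, so a caller would
# plot first and finalize afterwards, e.g.:
#
#     for name, series in curves.items():   # `curves` is illustrative
#         plt.plot(series, label=name)
#     plot_common(title="Training curves")  # legend, limits, title, save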
def main():
    args = utils.setup_parser(codes.get_code_names(), models.keys(),
                              utils.decoder_names).parse_args()
    log_level = logging.DEBUG if args.debug else logging.INFO
    if args.console:
        utils.setup_console_logger(log_level)
    else:
        utils.make_dir_if_not_exists(args.data_dir)
        utils.setup_file_logger(args.data_dir, 'test', log_level)
    print(vars(args))
    test(args)
def get_or_train_cav(concepts,
                     bottleneck,
                     acts,
                     cav_dir=None,
                     cav_hparams=None,
                     overwrite=False):
    """Gets the specified CAV, creating and training it if necessary.

    Assumes the activations already exist.

    Args:
      concepts: set of concepts used for CAV.
          Note: if there are two concepts, provide the positive concept
          first, then the negative concept (e.g., ['striped', 'random500_1']).
      bottleneck: the bottleneck used for CAV.
      acts: dictionary containing activations of the concepts in each
          bottleneck, e.g., acts[concept][bottleneck].
      cav_dir: a directory to store the results.
      cav_hparams: parameters used to learn the CAV.
      overwrite: if set to True, overwrite any saved CAV files.

    Returns:
      a CAV instance
    """
    if cav_hparams is None:
        cav_hparams = CAV.default_hparams()
    cav_path = None
    if cav_dir is not None:
        utils.make_dir_if_not_exists(cav_dir)
        cav_path = os.path.join(
            cav_dir,
            CAV.cav_key(concepts, bottleneck, cav_hparams.model_type,
                        cav_hparams.alpha).replace('/', '.') + '.pkl',
        )
        if not overwrite and os.path.exists(cav_path):
            tf.logging.info('CAV already exists: {}'.format(cav_path))
            cav_instance = CAV.load_cav(cav_path)
            return cav_instance
    tf.logging.info('Training CAV {} - {} alpha {}'.format(
        concepts, bottleneck, cav_hparams.alpha))
    cav_instance = CAV(concepts, bottleneck, cav_hparams, cav_path)
    cav_instance.train({c: acts[c] for c in concepts})
    return cav_instance
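# A hypothetical call sequence for the function above (a sketch; the concept
# names, bottleneck name, and directory are illustrative, and `acts` is
# assumed to have been computed elsewhere as acts[concept][bottleneck]):
#
#     cav = get_or_train_cav(['striped', 'random500_1'],  # positive, negative
#                            'mixed4c',
#                            acts,
#                            cav_dir='/tmp/cavs')
#     # A second call with the same arguments loads the cached .pkl file
#     # instead of retraining, unless overwrite=True is passed.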
def preprocess_batch(input_path, output_path, train_ratio):
    '''perform preprocessing on all input data csvs'''
    start = time.time()
    files = get_filename_list(input_path, 'csv')
    for file in files:
        symbol = file.split('.')[0]
        print("preprocessing " + symbol)
        data = pd.read_csv(format_path(input_path + '/' + file),
                           index_col='date')
        train_data, test_data = preprocess(data, train_ratio)
        formatted_output = format_path(output_path)
        make_dir_if_not_exists(formatted_output + '/train')
        make_dir_if_not_exists(formatted_output + '/test')
        train_data.to_csv(formatted_output + '/train' + '/' + symbol + '.csv')
        test_data.to_csv(formatted_output + '/test' + '/' + symbol + '.csv')
        print('saved csv files to ' + formatted_output + '{train, test}/' +
              symbol + '.csv')
    print("preprocessing complete")
    elapsed = time.time() - start
    print('time elapsed: ' + str(round(elapsed, 2)) + " seconds")
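# Example invocation (a sketch; the paths and the 80/20 split are
# illustrative, not taken from this file):
#
#     preprocess_batch('data/raw', 'data/processed', train_ratio=0.8)
#     # writes data/processed/train/<SYMBOL>.csv and
#     #        data/processed/test/<SYMBOL>.csv for every input csv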
def main():
    # Read all the data instances
    task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle(
        args.data_file)
    data, subtasks_list = get_multitask_instances_for_valid_tasks(
        task_instances_dict, tag_statistics)

    if args.retrain:
        logging.info("Creating and training the model from 'bert-base-cased'")
        # Create the save_directory if it does not exist
        make_dir_if_not_exists(args.save_directory)
        # Initialize tokenizer and model with pretrained weights
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        config = BertConfig.from_pretrained('bert-base-cased')
        config.subtasks = subtasks_list
        # print(config)
        model = MultiTaskBertForCovidEntityClassification.from_pretrained(
            'bert-base-cased', config=config)
        # Add new tokens to the tokenizer
        new_special_tokens_dict = {
            "additional_special_tokens": ["<E>", "</E>", "<URL>", "@USER"]
        }
        # new_special_tokens_dict = {"additional_special_tokens": ["<E>", "</E>"]}
        tokenizer.add_special_tokens(new_special_tokens_dict)
        # Add the new embeddings to the weights
        print("Embeddings type:",
              model.bert.embeddings.word_embeddings.weight.data.type())
        print("Embeddings shape:",
              model.bert.embeddings.word_embeddings.weight.data.size())
        embedding_size = model.bert.embeddings.word_embeddings.weight.size(1)
        new_embeddings = torch.FloatTensor(
            len(new_special_tokens_dict["additional_special_tokens"]),
            embedding_size).uniform_(-0.1, 0.1)
        # new_embeddings = torch.FloatTensor(2, embedding_size).uniform_(-0.1, 0.1)
        print("new_embeddings shape:", new_embeddings.size())
        new_embedding_weight = torch.cat(
            (model.bert.embeddings.word_embeddings.weight.data, new_embeddings),
            0)
        model.bert.embeddings.word_embeddings.weight.data = new_embedding_weight
        print("Embeddings shape:",
              model.bert.embeddings.word_embeddings.weight.data.size())
        # Update the model config vocab size
        model.config.vocab_size = model.config.vocab_size + len(
            new_special_tokens_dict["additional_special_tokens"])
    else:
        # Load the tokenizer and model from the save_directory
        tokenizer = BertTokenizer.from_pretrained(args.save_directory)
        model = MultiTaskBertForCovidEntityClassification.from_pretrained(
            args.save_directory)
        # print(model.state_dict().keys())
        # TODO: save and load the subtask classifier weights separately
        # Load from individual state dicts
        for subtask in model.subtasks:
            model.classifiers[subtask].load_state_dict(
                torch.load(
                    os.path.join(args.save_directory,
                                 f"{subtask}_classifier.bin")))
    # print(model.config)
    # exit()
    model.to(device)
    # Explicitly move the classifiers to device
    for subtask, classifier in model.classifiers.items():
        classifier.to(device)
    entity_start_token_id = tokenizer.convert_tokens_to_ids(["<E>"])[0]

    logging.info(
        f"Task dataset for task: {args.task} loaded from {args.data_file}.")

    model_config = dict()
    results = dict()

    # Split the data into train, dev and test, and shuffle the train segment
    train_data, dev_data, test_data = split_multitask_instances_in_train_dev_test(
        data)
    random.shuffle(train_data)  # shuffle happens in-place
    logging.info("Train Data:")
    total_train_size, pos_subtasks_train_size, neg_subtasks_train_size = log_multitask_data_statistics(
        train_data, model.subtasks)
    logging.info("Dev Data:")
    total_dev_size, pos_subtasks_dev_size, neg_subtasks_dev_size = log_multitask_data_statistics(
        dev_data, model.subtasks)
    logging.info("Test Data:")
    total_test_size, pos_subtasks_test_size, neg_subtasks_test_size = log_multitask_data_statistics(
        test_data, model.subtasks)
    logging.info("\n")
    model_config["train_data"] = {
        "size": total_train_size,
        "pos": pos_subtasks_train_size,
        "neg": neg_subtasks_train_size
    }
    model_config["dev_data"] = {
        "size": total_dev_size,
        "pos": pos_subtasks_dev_size,
        "neg": neg_subtasks_dev_size
    }
    model_config["test_data"] = {
        "size": total_test_size,
        "pos": pos_subtasks_test_size,
        "neg": neg_subtasks_test_size
    }

    # Extract subtasks data for dev and test
    dev_subtasks_data = split_data_based_on_subtasks(dev_data, model.subtasks)
    test_subtasks_data = split_data_based_on_subtasks(test_data, model.subtasks)

    # Load the instances into pytorch datasets
    train_dataset = COVID19TaskDataset(train_data)
    dev_dataset = COVID19TaskDataset(dev_data)
    test_dataset = COVID19TaskDataset(test_data)
    logging.info("Loaded the datasets into Pytorch datasets")

    tokenize_collator = TokenizeCollator(tokenizer, model.subtasks,
                                         entity_start_token_id)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=POSSIBLE_BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=0,
                                  collate_fn=tokenize_collator)
    dev_dataloader = DataLoader(dev_dataset,
                                batch_size=POSSIBLE_BATCH_SIZE,
                                shuffle=False,
                                num_workers=0,
                                collate_fn=tokenize_collator)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=POSSIBLE_BATCH_SIZE,
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=tokenize_collator)
    logging.info("Created train, dev and test dataloaders with batch aggregation")

    # Only retrain if needed
    if args.retrain:
        print('DO RETRAIN')
        ##########################################################################
        # NOTE: Training tutorial reference:
        # https://mccormickml.com/2019/07/22/BERT-fine-tuning/#41-bertforsequenceclassification
        ##########################################################################
        # Create an optimizer and training schedule for the BERT text
        # classification model.
        # NOTE: AdamW here is the huggingface implementation (as opposed to
        # pytorch's); the 'W' stands for the weight-decay fix.
        # Recommended settings for BERT fine-tuning, as per the paper:
        #   Batch size: 16, 32
        #   Learning rate (Adam): 5e-5, 3e-5, 2e-5
        #   Number of epochs: 2, 3, 4
        optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
        logging.info("Created model optimizer")
        # Number of training epochs. The BERT authors recommend between 2 and 4.
        epochs = args.n_epochs
        # Total number of training steps is [number of batches] x [number of
        # epochs] (note: this is not the number of training samples).
        total_steps = len(train_dataloader) * epochs
        # Create the learning rate scheduler.
        # NOTE: num_warmup_steps = 0 is the default value in run_glue.py
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        # We'll store a number of quantities such as training and validation
        # loss, validation accuracy, and timings.
        training_stats = []
        logging.info(f"Initiating training loop for {args.n_epochs} epochs...")
        # Measure the total training time for the whole run.
        total_start_time = time.time()

        # Find the accumulation steps
        accumulation_steps = args.batch_size / POSSIBLE_BATCH_SIZE

        # Loss trajectory for epochs
        epoch_train_loss = list()
        # Dev validation trajectory
        dev_subtasks_validation_statistics = {
            subtask: list() for subtask in model.subtasks
        }
        for epoch in range(epochs):
            pbar = tqdm(train_dataloader)
            logging.info(f"Initiating Epoch {epoch+1}:")
            # Reset the total loss for each epoch.
            total_train_loss = 0
            train_loss_trajectory = list()
            # Reset timer for each epoch
            start_time = time.time()
            model.train()

            dev_log_frequency = 5
            n_steps = len(train_dataloader)
            dev_steps = int(n_steps / dev_log_frequency)
            for step, batch in enumerate(pbar):
                # Move the labels of each subtask to device
                for subtask in model.subtasks:
                    subtask_labels = batch["gold_labels"][subtask]
                    subtask_labels = subtask_labels.to(device)
                    # print("HAHAHAHAH:", subtask_labels.is_cuda)
                    batch["gold_labels"][subtask] = subtask_labels
                    # print("HAHAHAHAH:", batch["gold_labels"][subtask].is_cuda)
                # Forward
                input_dict = {
                    "input_ids": batch["input_ids"].to(device),
                    "entity_start_positions": batch["entity_start_positions"].to(device),
                    "labels": batch["gold_labels"]
                }
                input_ids = batch["input_ids"]
                entity_start_positions = batch["entity_start_positions"]
                gold_labels = batch["gold_labels"]
                batch_data = batch["batch_data"]
                loss, logits = model(**input_dict)
                # loss = loss / accumulation_steps
                # Accumulate loss
                total_train_loss += loss.item()
                # Backward: compute gradients
                loss.backward()

                if (step + 1) % accumulation_steps == 0:
                    # Calculate elapsed time in minutes and print loss on the tqdm bar
                    elapsed = format_time(time.time() - start_time)
                    avg_train_loss = total_train_loss / (step + 1)
                    # Keep track of the changing avg_train_loss
                    train_loss_trajectory.append(avg_train_loss)
                    pbar.set_description(
                        f"Epoch:{epoch+1}|Batch:{step}/{len(train_dataloader)}|Time:{elapsed}|Avg. Loss:{avg_train_loss:.4f}|Loss:{loss.item():.4f}"
                    )
                    # Clip the norm of the gradients to 1.0 to help prevent
                    # the "exploding gradients" problem.
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    # Update parameters
                    optimizer.step()
                    # Clear the model's previously accumulated gradients
                    model.zero_grad()
                    # Update the learning rate.
                    scheduler.step()
                    pbar.update()
                if (step + 1) % dev_steps == 0:
                    # Perform validation with the model and log the performance
                    logging.info("Running Validation...")
                    # Put the model in evaluation mode -- the dropout layers
                    # behave differently during evaluation.
                    model.eval()
                    dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset(
                        dev_dataloader, model, device, args.task + "_dev", True)
                    for subtask in model.subtasks:
                        dev_subtask_data = dev_subtasks_data[subtask]
                        dev_subtask_prediction_scores = dev_prediction_scores[
                            subtask]
                        dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN(
                            dev_subtask_data, dev_subtask_prediction_scores)
                        logging.info(
                            f"Subtask:{subtask:>15}\tN={dev_TP + dev_FN}\tF1={dev_F1}\tP={dev_P}\tR={dev_R}\tTP={dev_TP}\tFP={dev_FP}\tFN={dev_FN}"
                        )
                        dev_subtasks_validation_statistics[subtask].append(
                            (epoch + 1, step + 1, dev_TP + dev_FN, dev_F1,
                             dev_P, dev_R, dev_TP, dev_FP, dev_FN))
                    # logging.info("DEBUG:Validation on Test")
                    # dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset(test_dataloader, model, device, args.task + "_dev", True)
                    # for subtask in model.subtasks:
                    #     dev_subtask_data = test_subtasks_data[subtask]
                    #     dev_subtask_prediction_scores = dev_prediction_scores[subtask]
                    #     dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN(dev_subtask_data, dev_subtask_prediction_scores)
                    #     logging.info(f"Subtask:{subtask:>15}\tN={dev_TP + dev_FN}\tF1={dev_F1}\tP={dev_P}\tR={dev_R}\tTP={dev_TP}\tFP={dev_FP}\tFN={dev_FN}")
                    #     dev_subtasks_validation_statistics[subtask].append((epoch + 1, step + 1, dev_TP + dev_FN, dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN))
                    # Put the model back in train mode
                    model.train()

            # Calculate the average loss over all of the batches.
            avg_train_loss = total_train_loss / len(train_dataloader)
            training_time = format_time(time.time() - start_time)
            # Record all statistics from this epoch.
            training_stats.append({
                'epoch': epoch + 1,
                'Training Loss': avg_train_loss,
                'Training Time': training_time
            })
            # Save the loss trajectory
            epoch_train_loss.append(train_loss_trajectory)

        logging.info(
            f"Training complete with total Train time:{format_time(time.time() - total_start_time)}"
        )
        log_list(training_stats)

        # Save the model and the tokenizer here:
        logging.info(f"Saving the model and tokenizer in {args.save_directory}")
        model.save_pretrained(args.save_directory)
        # Save each subtask classifier's weights to individual state dicts
        for subtask, classifier in model.classifiers.items():
            classifier_save_file = os.path.join(args.save_directory,
                                                f"{subtask}_classifier.bin")
            logging.info(
                f"Saving the model's {subtask} classifier weights at {classifier_save_file}"
            )
            torch.save(classifier.state_dict(), classifier_save_file)
        tokenizer.save_pretrained(args.save_directory)

        # Plot the train loss trajectory
        train_loss_trajectory_plot_file = os.path.join(
            args.output_dir, "train_loss_trajectory.png")
        logging.info(
            f"Saving the Train loss trajectory at {train_loss_trajectory_plot_file}"
        )
        plot_train_loss(epoch_train_loss, train_loss_trajectory_plot_file)
        # TODO: Plot the validation performance
        # Save dev_subtasks_validation_statistics
    else:
        logging.info("No training needed. Directly going to evaluation!")

    # Save the model name in the model_config file
    model_config["model"] = "MultiTaskBertForCovidEntityClassification"
    model_config["epochs"] = args.n_epochs

    # Find the best threshold for each subtask based on dev set performance
    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset(
        test_dataloader, model, device, args.task, True)
    dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset(
        dev_dataloader, model, device, args.task + "_dev", True)

    best_test_thresholds = {subtask: 0.5 for subtask in model.subtasks}
    best_dev_thresholds = {subtask: 0.5 for subtask in model.subtasks}
    best_test_F1s = {subtask: 0.0 for subtask in model.subtasks}
    best_dev_F1s = {subtask: 0.0 for subtask in model.subtasks}
    test_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks}
    dev_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks}
    # for subtask in model.subtasks:
    #     test_subtask_data = test_subtasks_data[subtask]
    #     test_subtask_prediction_scores = test_prediction_scores[subtask]
    #     for t in thresholds:
    #         test_F1, test_P, test_R, test_TP, test_FP, test_FN = get_TP_FP_FN(test_subtask_data, test_subtask_prediction_scores, THRESHOLD=t)
    #         test_subtasks_t_F1_P_Rs[subtask].append((t, test_F1, test_P, test_R, test_TP + test_FN, test_TP, test_FP, test_FN))
    #         if test_F1 > best_test_F1s[subtask]:
    #             best_test_thresholds[subtask] = t
    #             best_test_F1s[subtask] = test_F1
    #     logging.info(f"Subtask:{subtask:>15}")
    #     log_list(test_subtasks_t_F1_P_Rs[subtask])
    #     logging.info(f"Best Test Threshold for subtask: {best_test_thresholds[subtask]}\t Best test F1: {best_test_F1s[subtask]}")
    for subtask in model.subtasks:
        dev_subtask_data = dev_subtasks_data[subtask]
        dev_subtask_prediction_scores = dev_prediction_scores[subtask]
        for t in thresholds:
            dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN(
                dev_subtask_data, dev_subtask_prediction_scores, THRESHOLD=t)
            dev_subtasks_t_F1_P_Rs[subtask].append(
                (t, dev_F1, dev_P, dev_R, dev_TP + dev_FN, dev_TP, dev_FP,
                 dev_FN))
            if dev_F1 > best_dev_F1s[subtask]:
                best_dev_thresholds[subtask] = t
                best_dev_F1s[subtask] = dev_F1
        logging.info(f"Subtask:{subtask:>15}")
        log_list(dev_subtasks_t_F1_P_Rs[subtask])
        logging.info(
            f"Best Dev Threshold for subtask: {best_dev_thresholds[subtask]}\t Best dev F1: {best_dev_F1s[subtask]}"
        )

    # Save the best dev threshold and dev_F1 in the results dict
    results["best_dev_threshold"] = best_dev_thresholds
    results["best_dev_F1s"] = best_dev_F1s
    results["dev_t_F1_P_Rs"] = dev_subtasks_t_F1_P_Rs

    # Evaluate on Test
    logging.info("Testing on test dataset")
    # test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset(test_dataloader, model, device, args.task)
    predicted_labels, prediction_scores, gold_labels = make_predictions_on_dataset(
        test_dataloader, model, device, args.task)

    # Test
    for subtask in model.subtasks:
        logging.info(f"Testing the trained classifier on subtask: {subtask}")
        # print(len(test_dataloader))
        # print(len(prediction_scores[subtask]))
        # print(len(test_subtasks_data[subtask]))
        results[subtask] = dict()
        cm = metrics.confusion_matrix(gold_labels[subtask],
                                      predicted_labels[subtask])
        classification_report = metrics.classification_report(
            gold_labels[subtask], predicted_labels[subtask], output_dict=True)
        logging.info(cm)
        logging.info(
            metrics.classification_report(gold_labels[subtask],
                                          predicted_labels[subtask]))
        results[subtask]["CM"] = cm.tolist()  # store as a list of lists
        # instead of a numpy.ndarray
        results[subtask]["Classification Report"] = classification_report

        # SQuAD-style EM and F1 evaluation for all test cases and for positive
        # test cases (i.e. cases where annotators had a gold annotation)
        EM_score, F1_score, total = get_raw_scores(test_subtasks_data[subtask],
                                                   prediction_scores[subtask])
        logging.info("Word overlap based SQuAD evaluation style metrics:")
        logging.info(f"Total number of cases: {total}")
        logging.info(f"EM_score: {EM_score}")
        logging.info(f"F1_score: {F1_score}")
        results[subtask]["SQuAD_EM"] = EM_score
        results[subtask]["SQuAD_F1"] = F1_score
        results[subtask]["SQuAD_total"] = total
        pos_EM_score, pos_F1_score, pos_total = get_raw_scores(
            test_subtasks_data[subtask],
            prediction_scores[subtask],
            positive_only=True)
        logging.info(f"Total number of Positive cases: {pos_total}")
        logging.info(f"Pos. EM_score: {pos_EM_score}")
        logging.info(f"Pos. F1_score: {pos_F1_score}")
        results[subtask]["SQuAD_Pos. EM"] = pos_EM_score
        results[subtask]["SQuAD_Pos. F1"] = pos_F1_score
        results[subtask]["SQuAD_Pos. EM_F1_total"] = pos_total

        # New evaluation suggested by Alan
        F1, P, R, TP, FP, FN = get_TP_FP_FN(
            test_subtasks_data[subtask],
            prediction_scores[subtask],
            THRESHOLD=best_dev_thresholds[subtask])
        logging.info("New evaluation scores:")
        logging.info(f"F1: {F1}")
        logging.info(f"Precision: {P}")
        logging.info(f"Recall: {R}")
        logging.info(f"True Positive: {TP}")
        logging.info(f"False Positive: {FP}")
        logging.info(f"False Negative: {FN}")
        results[subtask]["F1"] = F1
        results[subtask]["P"] = P
        results[subtask]["R"] = R
        results[subtask]["TP"] = TP
        results[subtask]["FP"] = FP
        results[subtask]["FN"] = FN
        N = TP + FN
        results[subtask]["N"] = N

        # # Top predictions in the Test case
        # prediction_scores[subtask] = np.array(prediction_scores[subtask])
        # sorted_prediction_ids = np.argsort(-prediction_scores[subtask])
        # K = 200
        # logging.info("Top {} predictions:".format(K))
        # logging.info("\t".join(["Tweet", "BERT model input", "candidate chunk", "prediction score", "predicted label", "gold label", "gold chunks"]))
        # for i in range(K):
        #     instance_id = sorted_prediction_ids[i]
        #     # text :: candidate_chunk :: candidate_chunk_id :: chunk_start_text_id :: chunk_end_text_id :: tokenized_tweet :: tokenized_tweet_with_masked_q_token :: tagged_chunks :: question_label
        #     tweet = test_subtasks_data[subtask][instance_id][0].replace("\n", " ")
        #     chunk = test_subtasks_data[subtask][instance_id][1]
        #     tokenized_tweet_with_masked_chunk = test_subtasks_data[subtask][instance_id][6]
        #     if chunk in ["AUTHOR OF THE TWEET", "NEAR AUTHOR OF THE TWEET"]:
        #         # First element of the text will be considered as AUTHOR OF THE TWEET or NEAR AUTHOR OF THE TWEET
        #         bert_model_input_text = tokenized_tweet_with_masked_chunk.replace(Q_TOKEN, "<E> </E>")
        #         # print(tokenized_tweet_with_masked_chunk)
        #         # print(bert_model_input_text)
        #         # exit()
        #     else:
        #         bert_model_input_text = tokenized_tweet_with_masked_chunk.replace(Q_TOKEN, "<E> " + chunk + " </E>")
        #     list_to_print = [tweet, bert_model_input_text, chunk, str(prediction_scores[subtask][instance_id]), str(predicted_labels[subtask][instance_id]), str(test_subtasks_data[subtask][instance_id][-1]), str(test_subtasks_data[subtask][instance_id][-2])]
        #     logging.info("\t".join(list_to_print))

    # Save model_config and results
    model_config_file = os.path.join(args.output_dir, "model_config.json")
    results_file = os.path.join(args.output_dir, "results.json")
    logging.info(f"Saving model config at {model_config_file}")
    save_in_json(model_config, model_config_file)
    logging.info(f"Saving results at {results_file}")
    save_in_json(results, results_file)
                    type=int,
                    default=32)
parser.add_argument("-e",
                    "--n_epochs",
                    help="Number of epochs",
                    type=int,
                    default=8)
args = parser.parse_args()

import logging
# Ref: https://stackoverflow.com/a/49202811/4535284
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
# Also add the stream handler so that it logs on STD out as well
# Ref: https://stackoverflow.com/a/46098711/4535284
make_dir_if_not_exists(args.output_dir)
if args.retrain:
    logfile = os.path.join(args.output_dir, "train_output.log")
else:
    logfile = os.path.join(args.output_dir, "output.log")
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [%(levelname)s] %(message)s",
                    handlers=[
                        logging.FileHandler(logfile, mode='w'),
                        logging.StreamHandler()
                    ])

Q_TOKEN = "<Q_TARGET>"
URL_TOKEN = "<URL>"
RANDOM_SEED = 901
torch.manual_seed(RANDOM_SEED)
POSSIBLE_BATCH_SIZE = 8
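# The training loop elsewhere in this file only steps the optimizer every
# accumulation_steps = args.batch_size / POSSIBLE_BATCH_SIZE mini-batches, so
# the effective batch size stays at args.batch_size while only
# POSSIBLE_BATCH_SIZE examples are resident on the GPU at once. A minimal
# self-contained sketch of that pattern (all names below are illustrative;
# note this sketch also divides the loss by the number of accumulation steps,
# which the script itself leaves commented out):

def gradient_accumulation_sketch():
    import torch

    model = torch.nn.Linear(4, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    effective_batch_size, micro_batch_size = 32, 8
    accumulation_steps = effective_batch_size // micro_batch_size  # 4
    micro_batches = [torch.randn(micro_batch_size, 4) for _ in range(8)]

    for step, micro_batch in enumerate(micro_batches):
        loss = model(micro_batch).pow(2).mean() / accumulation_steps
        loss.backward()  # gradients accumulate across micro-batches
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()       # one update per effective batch
            optimizer.zero_grad()  # clear the accumulated gradients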
def main():
    args = parse_args()
    upsampling_factor = args.upsampling_factor
    data_path = os.path.join(args.data_path, "x{}".format(upsampling_factor))

    # Create output path
    out_path = os.path.join(args.out_path, "x{}".format(upsampling_factor))
    utils.make_dir_if_not_exists(out_path)

    # Shape parameters
    nchan = args.nchan
    width_hr = args.tile_size_w
    height_hr = args.tile_size_h
    width_lr = width_hr // upsampling_factor
    height_lr = height_hr // upsampling_factor
    shape_params = {
        "gt":     [height_hr, width_hr, nchan],
        "down":   [height_lr, width_lr, nchan],
        "bic_up": [height_hr, width_hr, nchan]
    }

    """ CREATE DATASET """
    batch_size = args.batch_size
    # Training dataset
    train_tf_rec_path = os.path.join(data_path, "Training")
    tfrec_filename_list = utils.get_list_files(train_tf_rec_path,
                                               ext='.tfrecords',
                                               sort_list=True)
    train_tfrec_list = [
        os.path.join(train_tf_rec_path, tfrec_filename)
        for tfrec_filename in tfrec_filename_list
    ]
    train_dataset = tf.data.TFRecordDataset(filenames=train_tfrec_list)
    train_dataset = train_dataset.shuffle(args.shuffle)
    train_dataset = train_dataset.map(lambda x: parse_record(x, shape_params),
                                      num_parallel_calls=4)
    train_dataset = train_dataset.batch(batch_size, drop_remainder=True)

    # Validation dataset
    val_tf_rec_path = os.path.join(data_path, "Validation")
    val_tfrec_filename_list = utils.get_list_files(val_tf_rec_path,
                                                   ext='.tfrecords',
                                                   sort_list=True)
    val_tfrec_list = [
        os.path.join(val_tf_rec_path, tfrec_filename)
        for tfrec_filename in val_tfrec_filename_list
    ]
    val_dataset = tf.data.TFRecordDataset(filenames=val_tfrec_list)
    val_dataset = val_dataset.shuffle(args.shuffle)
    val_dataset = val_dataset.map(lambda x: parse_record(x, shape_params),
                                  num_parallel_calls=4)
    val_dataset = val_dataset.batch(batch_size, drop_remainder=True)

    # Create a reinitializable iterator of the correct shape and type
    iterator = tf.data.Iterator.from_structure(train_dataset.output_types,
                                               train_dataset.output_shapes)
    next_element = iterator.get_next()
    # Create the initialization operations
    train_data_init_op = iterator.make_initializer(train_dataset)
    val_data_init_op = iterator.make_initializer(val_dataset)

    # Read the mean values of our dataset
    mean_val_path = os.path.join(data_path, "mean_values.pickle")
    with open(mean_val_path, "rb") as fid:
        mean_values = pickle.load(fid)
    mean_val_gt = mean_values["gt"]
    mean_val_down = mean_values["down"]
    mean_val_bic_up = mean_values["bic_up"]

    """ BUILD THE MODEL """
    # Graph Model
    params_superres_net = {
        "num_FB_layers":     args.num_FB_layers,
        "num_dist_blocks":   args.num_Dist_blocks,
        "upsampling_factor": upsampling_factor
    }
    input_superres_net = {
        "tf_init":      next_element["down"],
        "tf_upsampled": next_element["bic_up"],
    }
    tf_superres, weights_conv_list, bias_list = build_model(
        params_superres_net, input_superres_net)

    """ TRAINING OPERATORS """
    # SSIM operators
    ssim_gt_res_op = tf.image.ssim(next_element["gt"], tf_superres, max_val=1.0)
    ssim_gt_res_op = tf.reduce_mean(ssim_gt_res_op)

    # Loss operator
    if args.ssim_loss:
        loss_op = -1.0 * ssim_gt_res_op
    else:
        loss_op = tf.losses.absolute_difference(next_element["gt"], tf_superres)

    # Weight decay
    loss_decay = [tf.nn.l2_loss(w) for w in weights_conv_list]
    loss_decay = args.weight_decay * tf.add_n(loss_decay)
    loss_op += loss_decay

    # Create the optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    train_op = optimizer.minimize(loss_op)

    # Create summaries
    num_img_viz = args.num_img_viz
    summary_train_loss_ph = tf.placeholder(tf.float32,
                                           name="summary_train_loss_ph")
    summary_val_loss_ph = tf.placeholder(tf.float32,
                                         name="summary_val_loss_ph")
    summary_train_ssim_gt_res_ph = tf.placeholder(
        tf.float32, name="summary_train_ssim_gt_res_ph")
    summary_val_ssim_gt_res_ph = tf.placeholder(
        tf.float32, name="summary_val_ssim_gt_res_ph")
    summary_train_gt_ph = tf.placeholder(tf.float32, [None, None, None, nchan],
                                         name="summary_train_gt")
    summary_train_result_ph = tf.placeholder(tf.float32,
                                             [None, None, None, nchan],
                                             name="summary_train_result")
    summary_train_bic_up_ph = tf.placeholder(tf.float32,
                                             [None, None, None, nchan],
                                             name="summary_train_bic_up")
    summary_train_residual_gt_res_ph = tf.placeholder(
        tf.float32, [None, None, None, 3],
        name="summary_train_residual_gt_res_ph")
    summary_train_residual_bicup_res_ph = tf.placeholder(
        tf.float32, [None, None, None, 3],
        name="summary_train_residual_bicup_res_ph")
    summary_val_gt_ph = tf.placeholder(tf.float32, [None, None, None, nchan],
                                       name="summary_val_gt")
    summary_val_result_ph = tf.placeholder(tf.float32,
                                           [None, None, None, nchan],
                                           name="summary_val_result")
    summary_val_bic_up_ph = tf.placeholder(tf.float32,
                                           [None, None, None, nchan],
                                           name="summary_val_bic_up")
    summary_val_residual_gt_res_ph = tf.placeholder(
        tf.float32, [None, None, None, 3],
        name="summary_val_residual_gt_res_ph")
    summary_val_residual_bicup_res_ph = tf.placeholder(
        tf.float32, [None, None, None, 3],
        name="summary_val_residual_bicup_res_ph")

    tf.summary.scalar("01_train_loss", summary_train_loss_ph)
    tf.summary.scalar("02_train_ssim_gt_res", summary_train_ssim_gt_res_ph)
    tf.summary.scalar("12_val_loss", summary_val_loss_ph)
    tf.summary.scalar("12_val_ssim_gt_res", summary_val_ssim_gt_res_ph)
    tf.summary.image('00_train_gt', summary_train_gt_ph, num_img_viz)
    tf.summary.image('01_train_bic_up', summary_train_bic_up_ph, num_img_viz)
    tf.summary.image('02_train_result', summary_train_result_ph, num_img_viz)
    tf.summary.image('03_train_residual_gt_res',
                     summary_train_residual_gt_res_ph, num_img_viz)
    tf.summary.image('04_train_residual_bicup_res',
                     summary_train_residual_bicup_res_ph, num_img_viz)
    tf.summary.image('10_val_gt', summary_val_gt_ph, num_img_viz)
    tf.summary.image('11_val_bic_up', summary_val_bic_up_ph, num_img_viz)
    tf.summary.image('12_val_result', summary_val_result_ph, num_img_viz)
    tf.summary.image('13_val_residual_gt_res', summary_val_residual_gt_res_ph,
                     num_img_viz)
    tf.summary.image('14_val_residual_bicup_res',
                     summary_val_residual_bicup_res_ph, num_img_viz)
    summary_op = tf.summary.merge_all()

    # Model and Train Saver
    with tf.name_scope("Train_Saver"):
        train_saver = tf.train.Saver(max_to_keep=5,
                                     keep_checkpoint_every_n_hours=2,
                                     save_relative_paths=True,
                                     pad_step_number=True)

    """ TRAINING SESSION """
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Create Summary writer
        log_path = os.path.join(out_path, "Log")
        log_writer = tf.summary.FileWriter(log_path)
        log_writer.add_graph(sess.graph)

        # Savers
        checkpoint_path = os.path.join(out_path, "ckpts")
        utils.make_dir_if_not_exists(checkpoint_path)
        train_saver.save(sess,
                         os.path.join(checkpoint_path, "initial"),
                         write_meta_graph=True)

        # Restore the model if necessary
        if args.restore:
            restore_model(args.pretrained_path, upsampling_factor, train_saver,
                          sess)

        # Start training
        for epoch in range(args.nepochs):
            print("Epoch {}".format(epoch))

            """ TRAINING """
            sess.run(train_data_init_op)
            num_batches = 0
            loss_train_mean = 0
            ssim_gt_res_train_mean = 0
            while True:
                try:
                    # Run the model
                    (gt_value_train, bic_up_value_train, result_value_train,
                     ssim_gt_res_value, loss_value, _) = sess.run([
                         next_element["gt"],
                         next_element["bic_up"],
                         tf_superres,
                         ssim_gt_res_op,
                         loss_op,
                         train_op,
                     ])
                    # Update values
                    num_batches += 1
                    loss_train_mean += loss_value
                    ssim_gt_res_train_mean += ssim_gt_res_value
                except tf.errors.OutOfRangeError:
                    break

            # Update mean values
            loss_train_mean /= num_batches
            ssim_gt_res_train_mean /= num_batches

            # Arrange images to be visualized in tensorboard
            gt_value_train += mean_val_gt
            bic_up_value_train += mean_val_bic_up
            result_value_train += mean_val_gt
            result_value_train = np.maximum(
                0.0, np.minimum(result_value_train, 1.0))

            # Residual
            residual_gt_result_train = gt_value_train - result_value_train
            residual_bicup_result_train = bic_up_value_train - result_value_train
            min_colorscale = min(np.min(residual_gt_result_train),
                                 np.min(residual_bicup_result_train))
            max_colorscale = max(np.max(residual_gt_result_train),
                                 np.max(residual_bicup_result_train))
            residual_bicup_result_train = colorize(residual_bicup_result_train,
                                                   min_colorscale,
                                                   max_colorscale,
                                                   cmap='jet')
            residual_gt_result_train = colorize(residual_gt_result_train,
                                                min_colorscale,
                                                max_colorscale,
                                                cmap='jet')

            """ VALIDATION """
            sess.run(val_data_init_op)
            num_batches = 0
            loss_val_mean = 0
            ssim_gt_res_val_mean = 0
            while True:
                try:
                    # Run the model (no train_op: weights are not updated)
                    (gt_value_val, bic_up_value_val, result_value_val,
                     ssim_gt_res_value, loss_value) = sess.run([
                         next_element["gt"],
                         next_element["bic_up"],
                         tf_superres,
                         ssim_gt_res_op,
                         loss_op,
                     ])
                    # Update values
                    num_batches += 1
                    loss_val_mean += loss_value
                    ssim_gt_res_val_mean += ssim_gt_res_value
                except tf.errors.OutOfRangeError:
                    break

            # Update mean values
            loss_val_mean /= num_batches
            ssim_gt_res_val_mean /= num_batches

            # Arrange images to be visualized in tensorboard
            gt_value_val += mean_val_gt
            bic_up_value_val += mean_val_bic_up
            result_value_val += mean_val_gt
            result_value_val = np.maximum(0.0, np.minimum(result_value_val, 1.0))

            # Residual
            residual_gt_result_val = gt_value_val - result_value_val
            residual_bicup_result_val = bic_up_value_val - result_value_val
            min_colorscale = min(np.min(residual_gt_result_val),
                                 np.min(residual_bicup_result_val))
            max_colorscale = max(np.max(residual_gt_result_val),
                                 np.max(residual_bicup_result_val))
            residual_bicup_result_val = colorize(residual_bicup_result_val,
                                                 min_colorscale,
                                                 max_colorscale,
                                                 cmap='jet')
            residual_gt_result_val = colorize(residual_gt_result_val,
                                              min_colorscale,
                                              max_colorscale,
                                              cmap='jet')

            # Write out summaries
            summary = sess.run(
                summary_op,
                feed_dict={
                    summary_train_loss_ph: loss_train_mean,
                    summary_train_ssim_gt_res_ph: ssim_gt_res_train_mean,
                    summary_train_gt_ph: gt_value_train,
                    summary_train_bic_up_ph: bic_up_value_train,
                    summary_train_result_ph: result_value_train,
                    summary_train_residual_gt_res_ph: residual_gt_result_train,
                    summary_train_residual_bicup_res_ph: residual_bicup_result_train,
                    summary_val_loss_ph: loss_val_mean,
                    summary_val_ssim_gt_res_ph: ssim_gt_res_val_mean,
                    summary_val_gt_ph: gt_value_val,
                    summary_val_bic_up_ph: bic_up_value_val,
                    summary_val_result_ph: result_value_val,
                    summary_val_residual_gt_res_ph: residual_gt_result_val,
                    summary_val_residual_bicup_res_ph: residual_bicup_result_val,
                })
            log_writer.add_summary(summary, epoch)

            # Save a checkpoint
            train_saver.save(sess,
                             os.path.join(checkpoint_path, "checkpoint"),
                             global_step=epoch,
                             write_meta_graph=False)

        # Save the final model
        train_saver.save(sess,
                         os.path.join(checkpoint_path, "final"),
                         write_meta_graph=True)
for seg in range(0, ns):
    i = seg // ns_per_dim  # integer division (was Python 2 `/`)
    j = seg % ns_per_dim
    z_bot = z_bot.view(batch_size, 32, 8, 8)
    z_volume = z_bot[:, :, i * 4:(i + 1) * 4, j * 4:(j + 1) * 4]
    x_seg_high_rec = gen_bot(z_volume, give_pre=True)
    fake_images[:, :, i * seg_length:(i + 1) * seg_length,
                j * seg_length:(j + 1) * seg_length] = x_seg_high_rec.data

# reconstruction_loss = ((rec_images_top - images.cpu().data)**2).mean()
# print('high level reconstruction_loss:', reconstruction_loss)

elapsed = timer() - start_time
if elapsed > checkpoint_i * CHECKPOINT_INTERVAL:
    print('Writing images and checkpoints')
    make_dir_if_not_exists(OUT_DIR)
    make_dir_if_not_exists(MODELS_DIR)
    save_image(
        denorm(rec_images_bot),
        os.path.join(OUT_DIR, 'rec_bot_images%04d.png' % checkpoint_i))
    save_image(
        denorm(fake_images),
        os.path.join(OUT_DIR, 'fake_images%04d.png' % checkpoint_i))
    save_image(
        denorm(images.data),
        os.path.join(OUT_DIR, 'real_images%04d.png' % checkpoint_i))
    save_image(
        denorm(rec_full_bot),
        os.path.join(OUT_DIR, 'rec_full_bot_images%04d.png' % checkpoint_i))
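# The fragment above gates image/checkpoint writes on wall-clock time rather
# than on step count. A self-contained sketch of that pattern (the interval
# value and the helper name are illustrative, not from this file):

from timeit import default_timer as timer

CHECKPOINT_INTERVAL = 600.0  # seconds between checkpoints; illustrative

def run_with_time_based_checkpoints(train_step, save_checkpoint, n_steps):
    """Call save_checkpoint() whenever another CHECKPOINT_INTERVAL elapses."""
    start_time = timer()
    checkpoint_i = 1
    for _ in range(n_steps):
        train_step()
        if timer() - start_time > checkpoint_i * CHECKPOINT_INTERVAL:
            save_checkpoint(checkpoint_i)
            checkpoint_i += 1  # advance so the next write waits a full interval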
# Initialize tokenizer and model with pretrained weights
tokenizer = BertTokenizer.from_pretrained(model_name)
# Add new tokens to the tokenizer
new_special_tokens_dict = {"additional_special_tokens": ["<user>"]}
tokenizer.add_special_tokens(new_special_tokens_dict)

dataset = COVID19SentDataset(args.data_file, 0.7, tokenizer)
print(len(dataset.train_dataset), dataset.train_dataset[0])
print(len(dataset.dev_dataset), dataset.dev_dataset[0])
print(len(dataset.train_dataset[0][0]), dataset.train_dataset[0][1].shape)
[print(k, v.shape) for k, v in dataset.train_dataset[0][2].items()]

save_dir = "./" + args.run + "_" + args.task
make_dir_if_not_exists(save_dir)

config = BertConfig.from_pretrained(model_name)
config.subtasks = DO_TASKS
model = SentBert.from_pretrained(model_name, config=config)

# Add the new embeddings to the weights
print("Embeddings type:",
      model.bert.embeddings.word_embeddings.weight.data.type())
print("Embeddings shape:",
      model.bert.embeddings.word_embeddings.weight.data.size())
embedding_size = model.bert.embeddings.word_embeddings.weight.size(1)
new_embeddings = torch.FloatTensor(
    len(new_special_tokens_dict["additional_special_tokens"]),
    embedding_size).uniform_(-0.1, 0.1)
print("new_embeddings shape:", new_embeddings.size())
def main():
    # Read all the data instances
    task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle(
        args.data_file)
    data, subtasks_list = get_multitask_instances_for_valid_tasks(
        task_instances_dict, tag_statistics)

    if args.retrain:
        logging.info("Creating and training the model from 'bert-base-cased'")
        # Create the save_directory if it does not exist
        make_dir_if_not_exists(args.save_directory)
        # Initialize tokenizer and model with pretrained weights
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        config = BertConfig.from_pretrained('bert-base-cased')
        config.subtasks = subtasks_list
        # print(config)
        model = MultiTaskBertForCovidEntityClassification.from_pretrained(
            'bert-base-cased', config=config)
        # Add new tokens to the tokenizer
        new_special_tokens_dict = {
            "additional_special_tokens": ["<E>", "</E>", "<URL>", "@USER"]
        }
        # new_special_tokens_dict = {"additional_special_tokens": ["<E>", "</E>"]}
        tokenizer.add_special_tokens(new_special_tokens_dict)
        # Add the new embeddings to the weights
        print("Embeddings type:",
              model.bert.embeddings.word_embeddings.weight.data.type())
        print("Embeddings shape:",
              model.bert.embeddings.word_embeddings.weight.data.size())
        embedding_size = model.bert.embeddings.word_embeddings.weight.size(1)
        new_embeddings = torch.FloatTensor(
            len(new_special_tokens_dict["additional_special_tokens"]),
            embedding_size).uniform_(-0.1, 0.1)
        # new_embeddings = torch.FloatTensor(2, embedding_size).uniform_(-0.1, 0.1)
        print("new_embeddings shape:", new_embeddings.size())
        new_embedding_weight = torch.cat(
            (model.bert.embeddings.word_embeddings.weight.data, new_embeddings),
            0)
        model.bert.embeddings.word_embeddings.weight.data = new_embedding_weight
        print("Embeddings shape:",
              model.bert.embeddings.word_embeddings.weight.data.size())
        # Update the model config vocab size
        model.config.vocab_size = model.config.vocab_size + len(
            new_special_tokens_dict["additional_special_tokens"])
    else:
        # Load the tokenizer and model from the save_directory
        tokenizer = BertTokenizer.from_pretrained(args.save_directory)
        model = MultiTaskBertForCovidEntityClassification.from_pretrained(
            args.save_directory)
        # print(model.state_dict().keys())
        # TODO: save and load the subtask classifier weights separately
        # Load from individual state dicts
        for subtask in model.subtasks:
            model.classifiers[subtask].load_state_dict(
                torch.load(
                    os.path.join(args.save_directory,
                                 f"{subtask}_classifier.bin")))
    # print(model.config)
    # exit()
    model.to(device)
    # Explicitly move the classifiers to device
    for subtask, classifier in model.classifiers.items():
        classifier.to(device)
    entity_start_token_id = tokenizer.convert_tokens_to_ids(["<E>"])[0]

    logging.info(
        f"Task dataset for task: {args.task} loaded from {args.data_file}.")

    model_config = dict()
    results = dict()

    test_data = data
    logging.info("Test Data:")
    total_test_size, pos_subtasks_test_size, neg_subtasks_test_size = log_multitask_data_statistics(
        test_data, model.subtasks)
    logging.info("\n")
    # model_config["train_data"] = {"size": total_train_size, "pos": pos_subtasks_train_size, "neg": neg_subtasks_train_size}
    # model_config["dev_data"] = {"size": total_dev_size, "pos": pos_subtasks_dev_size, "neg": neg_subtasks_dev_size}
    model_config["test_data"] = {
        "size": total_test_size,
        "pos": pos_subtasks_test_size,
        "neg": neg_subtasks_test_size
    }

    # Extract subtasks data for test
    # dev_subtasks_data = split_data_based_on_subtasks(dev_data, model.subtasks)
    test_subtasks_data = split_data_based_on_subtasks(test_data, model.subtasks)

    # Load the instances into a pytorch dataset
    # train_dataset = COVID19TaskDataset(train_data)
    # dev_dataset = COVID19TaskDataset(dev_data)
    test_dataset = COVID19TaskDataset(test_data)
    logging.info("Loaded the datasets into Pytorch datasets")

    tokenize_collator = TokenizeCollator(tokenizer, model.subtasks,
                                         entity_start_token_id)
    # train_dataloader = DataLoader(train_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=tokenize_collator)
    # dev_dataloader = DataLoader(dev_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=tokenize_collator)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=POSSIBLE_BATCH_SIZE,
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=tokenize_collator)
    logging.info("Created the test dataloader with batch aggregation")

    # Save the model name in the model_config file
    model_config["model"] = "MultiTaskBertForCovidEntityClassification"
    model_config["epochs"] = args.n_epochs

    # Find the best threshold for each subtask based on dev set performance
    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset(
        test_dataloader, model, device, args.task, True)
    # dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset(dev_dataloader, model, device, args.task + "_dev", True)
    best_test_thresholds = {subtask: 0.5 for subtask in model.subtasks}
    best_dev_thresholds = {subtask: 0.5 for subtask in model.subtasks}
    best_test_F1s = {subtask: 0.0 for subtask in model.subtasks}
    best_dev_F1s = {subtask: 0.0 for subtask in model.subtasks}
    test_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks}
    dev_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks}

    # Evaluate on Test
    logging.info("Testing on test dataset")
    # test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset(test_dataloader, model, device, args.task)
    predicted_labels, prediction_scores, gold_labels = make_predictions_on_dataset(
        test_dataloader, model, device, args.task)

    # Test
    for subtask in model.subtasks:
        logging.info(f"Testing the trained classifier on subtask: {subtask}")
        # print(len(test_dataloader))
        # print(len(prediction_scores[subtask]))
        # print(len(test_subtasks_data[subtask]))
        results[subtask] = dict()
        cm = metrics.confusion_matrix(gold_labels[subtask],
                                      predicted_labels[subtask])
        classification_report = metrics.classification_report(
            gold_labels[subtask], predicted_labels[subtask], output_dict=True)
        logging.info(cm)
        logging.info(
            metrics.classification_report(gold_labels[subtask],
                                          predicted_labels[subtask]))
        results[subtask]["CM"] = cm.tolist()  # store as a list of lists
        # instead of a numpy.ndarray
        results[subtask]["Classification Report"] = classification_report

        # SQuAD-style EM and F1 evaluation for all test cases and for positive
        # test cases (i.e. cases where annotators had a gold annotation)
        EM_score, F1_score, total = get_raw_scores(test_subtasks_data[subtask],
                                                   prediction_scores[subtask])
        logging.info("Word overlap based SQuAD evaluation style metrics:")
        logging.info(f"Total number of cases: {total}")
        logging.info(f"EM_score: {EM_score}")
        logging.info(f"F1_score: {F1_score}")
        results[subtask]["SQuAD_EM"] = EM_score
        results[subtask]["SQuAD_F1"] = F1_score
        results[subtask]["SQuAD_total"] = total
        pos_EM_score, pos_F1_score, pos_total = get_raw_scores(
            test_subtasks_data[subtask],
            prediction_scores[subtask],
            positive_only=True)
        logging.info(f"Total number of Positive cases: {pos_total}")
        logging.info(f"Pos. EM_score: {pos_EM_score}")
        logging.info(f"Pos. F1_score: {pos_F1_score}")
        results[subtask]["SQuAD_Pos. EM"] = pos_EM_score
        results[subtask]["SQuAD_Pos. F1"] = pos_F1_score
        results[subtask]["SQuAD_Pos. EM_F1_total"] = pos_total

        # New evaluation suggested by Alan
        F1, P, R, TP, FP, FN = get_TP_FP_FN(
            test_subtasks_data[subtask],
            prediction_scores[subtask],
            THRESHOLD=best_dev_thresholds[subtask])
        logging.info("New evaluation scores:")
        logging.info(f"F1: {F1}")
        logging.info(f"Precision: {P}")
        logging.info(f"Recall: {R}")
        logging.info(f"True Positive: {TP}")
        logging.info(f"False Positive: {FP}")
        logging.info(f"False Negative: {FN}")
        results[subtask]["F1"] = F1
        results[subtask]["P"] = P
        results[subtask]["R"] = R
        results[subtask]["TP"] = TP
        results[subtask]["FP"] = FP
        results[subtask]["FN"] = FN
        N = TP + FN
        results[subtask]["N"] = N

        # # Top predictions in the Test case
        # prediction_scores[subtask] = np.array(prediction_scores[subtask])
        # sorted_prediction_ids = np.argsort(-prediction_scores[subtask])
        # K = 200
        # logging.info("Top {} predictions:".format(K))
        # logging.info("\t".join(["Tweet", "BERT model input", "candidate chunk", "prediction score", "predicted label", "gold label", "gold chunks"]))
        # for i in range(K):
        #     instance_id = sorted_prediction_ids[i]
        #     # text :: candidate_chunk :: candidate_chunk_id :: chunk_start_text_id :: chunk_end_text_id :: tokenized_tweet :: tokenized_tweet_with_masked_q_token :: tagged_chunks :: question_label
        #     tweet = test_subtasks_data[subtask][instance_id][0].replace("\n", " ")
        #     chunk = test_subtasks_data[subtask][instance_id][1]
        #     tokenized_tweet_with_masked_chunk = test_subtasks_data[subtask][instance_id][6]
        #     if chunk in ["AUTHOR OF THE TWEET", "NEAR AUTHOR OF THE TWEET"]:
        #         # First element of the text will be considered as AUTHOR OF THE TWEET or NEAR AUTHOR OF THE TWEET
        #         bert_model_input_text = tokenized_tweet_with_masked_chunk.replace(Q_TOKEN, "<E> </E>")
        #     else:
        #         bert_model_input_text = tokenized_tweet_with_masked_chunk.replace(Q_TOKEN, "<E> " + chunk + " </E>")
        #     list_to_print = [tweet, bert_model_input_text, chunk, str(prediction_scores[subtask][instance_id]), str(predicted_labels[subtask][instance_id]), str(test_subtasks_data[subtask][instance_id][-1]), str(test_subtasks_data[subtask][instance_id][-2])]
        #     logging.info("\t".join(list_to_print))

    # Save model_config and results
    model_config_file = os.path.join(args.output_dir, "model_config.json")
    results_file = os.path.join(args.output_dir, "results.json")
    logging.info(f"Saving model config at {model_config_file}")
    save_in_json(model_config, model_config_file)
    logging.info(f"Saving results at {results_file}")
    save_in_json(results, results_file)
import model as model
import tcav as tcav
import utils as utils
import utils_plot as utils_plot  # utils_plot requires matplotlib
import os
import torch
import activation_generator as act_gen
import tensorflow as tf

working_dir = './tcav_class_test'
activation_dir = working_dir + '/activations/'
cav_dir = working_dir + '/cavs/'
source_dir = "./data/"
bottlenecks = ['conv2']

utils.make_dir_if_not_exists(activation_dir)
utils.make_dir_if_not_exists(working_dir)
utils.make_dir_if_not_exists(cav_dir)

# This is a regularizer penalty parameter for the linear classifier used to
# get CAVs.
alphas = [0.1]
target = 'cat'
concepts = ["dotted", "striped", "zigzagged"]
random_counterpart = 'random500_1'

LABEL_PATH = './data/imagenet_comp_graph_label_strings.txt'

mymodel = model.CNNWrapper(LABEL_PATH)

act_generator = act_gen.ImageActivationGenerator(mymodel,
                                                 source_dir,
                                                 activation_dir,
                                                 max_examples=100)
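# A hypothetical next step (an assumption, not shown in this snippet: that the
# local `tcav` module mirrors the reference TCAV interface, where a TCAV
# object is built from the target, concepts, bottlenecks, activation
# generator, and alphas, and `run()` returns per-concept scores):
#
#     my_tcav = tcav.TCAV(target, concepts, bottlenecks, act_generator,
#                         alphas, random_counterpart, cav_dir=cav_dir)
#     results = my_tcav.run()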
def main(): # Read all the data instances task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle( args.data_file) data, subtasks_list = get_multitask_instances_for_valid_tasks( task_instances_dict, tag_statistics) data = add_marker_for_loss_ignore( data, 1.0 if args.loss_for_no_consensus else 0.0) if args.retrain: if args.large_bert: model_name = "bert-large-cased" elif args.covid_bert: model_name = "digitalepidemiologylab/covid-twitter-bert" else: model_name = "bert-base-cased" logging.info("Creating and training the model from '" + model_name + "'") # Create the save_directory if not exists make_dir_if_not_exists(args.save_directory) # Initialize tokenizer and model with pretrained weights tokenizer = BertTokenizer.from_pretrained(model_name) config = BertConfig.from_pretrained(model_name) config.subtasks = subtasks_list model = MultiTaskBertForCovidEntityClassification.from_pretrained( model_name, config=config) # Add new tokens in tokenizer new_special_tokens_dict = { "additional_special_tokens": ["<E>", "</E>", "<URL>", "@USER"] } tokenizer.add_special_tokens(new_special_tokens_dict) # Add the new embeddings in the weights print("Embeddings type:", model.bert.embeddings.word_embeddings.weight.data.type()) print("Embeddings shape:", model.bert.embeddings.word_embeddings.weight.data.size()) embedding_size = model.bert.embeddings.word_embeddings.weight.size(1) new_embeddings = torch.FloatTensor( len(new_special_tokens_dict["additional_special_tokens"]), embedding_size).uniform_(-0.1, 0.1) # new_embeddings = torch.FloatTensor(2, embedding_size).uniform_(-0.1, 0.1) print("new_embeddings shape:", new_embeddings.size()) new_embedding_weight = torch.cat( (model.bert.embeddings.word_embeddings.weight.data, new_embeddings), 0) model.bert.embeddings.word_embeddings.weight.data = new_embedding_weight print("Embeddings shape:", model.bert.embeddings.word_embeddings.weight.data.size()) # Update model config vocab size model.config.vocab_size = model.config.vocab_size + len( new_special_tokens_dict["additional_special_tokens"]) else: # Load the tokenizer and model from the save_directory tokenizer = BertTokenizer.from_pretrained(args.save_directory) model = MultiTaskBertForCovidEntityClassification.from_pretrained( args.save_directory) # Load from individual state dicts for subtask in model.subtasks: model.classifiers[subtask].load_state_dict( torch.load( os.path.join(args.save_directory, f"{subtask}_classifier.bin"))) model.to(device) if args.wandb: wandb.watch(model) # Explicitly move the classifiers to device for subtask, classifier in model.classifiers.items(): classifier.to(device) for subtask, classifier in model.context_vectors.items(): classifier.to(device) entity_start_token_id = tokenizer.convert_tokens_to_ids(["<E>"])[0] entity_end_token_id = tokenizer.convert_tokens_to_ids(["</E>"])[0] logging.info( f"Task dataset for task: {args.task} loaded from {args.data_file}.") model_config = dict() results = dict() # Split the data into train, dev and test and shuffle the train segment train_data, dev_data = split_multitask_instances_in_train_dev(data) random.shuffle(train_data) # shuffle happens in-place logging.info("Train Data:") total_train_size, pos_subtasks_train_size, neg_subtasks_train_size = log_multitask_data_statistics( train_data, model.subtasks) logging.info("Dev Data:") total_dev_size, pos_subtasks_dev_size, neg_subtasks_dev_size = log_multitask_data_statistics( dev_data, model.subtasks) #logging.info("Test Data:") #total_test_size, pos_subtasks_test_size, 
neg_subtasks_test_size = log_multitask_data_statistics(test_data, model.subtasks) logging.info("\n") model_config["train_data"] = { "size": total_train_size, "pos": pos_subtasks_train_size, "neg": neg_subtasks_train_size } model_config["dev_data"] = { "size": total_dev_size, "pos": pos_subtasks_dev_size, "neg": neg_subtasks_dev_size } #model_config["test_data"] = {"size":total_test_size, "pos":pos_subtasks_test_size, "neg":neg_subtasks_test_size} # Extract subtasks data for dev and test train_subtasks_data = split_data_based_on_subtasks(train_data, model.subtasks) dev_subtasks_data = split_data_based_on_subtasks(dev_data, model.subtasks) #test_subtasks_data = split_data_based_on_subtasks(test_data, model.subtasks) # Load the instances into pytorch dataset train_dataset = COVID19TaskDataset(train_data) dev_dataset = COVID19TaskDataset(dev_data) #test_dataset = COVID19TaskDataset(test_data) logging.info("Loaded the datasets into Pytorch datasets") tokenize_collator = TokenizeCollator(tokenizer, model.subtasks, entity_start_token_id, entity_end_token_id) train_dataloader = DataLoader(train_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=tokenize_collator) dev_dataloader = DataLoader(dev_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=tokenize_collator) #test_dataloader = DataLoader(test_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=tokenize_collator) logging.info("Created train and test dataloaders with batch aggregation") # Only retrain if needed if args.retrain: optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8) logging.info("Created model optimizer") #if args.sentence_level_classify: # args.n_epochs += 2 epochs = args.n_epochs # Total number of training steps is [number of batches] x [number of epochs]. total_steps = len(train_dataloader) * epochs # Create the learning rate scheduler. # NOTE: num_warmup_steps = 0 is the Default value in run_glue.py scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=total_steps) # We'll store a number of quantities such as training and validation loss, validation accuracy, and timings. training_stats = [] print("\n\n\n ====== Training for task", args.task, "=============\n\n\n") logging.info(f"Initiating training loop for {args.n_epochs} epochs...") print(model.state_dict().keys()) total_start_time = time.time() # Find the accumulation steps accumulation_steps = args.batch_size / POSSIBLE_BATCH_SIZE # Dev validation trajectory epoch_train_loss = list() train_subtasks_validation_statistics = { subtask: list() for subtask in model.subtasks } dev_subtasks_validation_statistics = { subtask: list() for subtask in model.subtasks } best_dev_F1 = 0 for epoch in range(epochs): logging.info(f"Initiating Epoch {epoch+1}:") # Reset the total loss for each epoch. 
total_train_loss = 0 train_loss_trajectory = list() # Reset timer for each epoch start_time = time.time() model.train() dev_log_frequency = 5 n_steps = len(train_dataloader) dev_steps = int(n_steps / dev_log_frequency) for step, batch in enumerate(train_dataloader): # Upload labels of each subtask to device for subtask in model.subtasks: subtask_labels = batch["gold_labels"][subtask] subtask_labels = subtask_labels.to(device) batch["gold_labels"][subtask] = subtask_labels batch["label_ignore_loss"][subtask] = batch[ "label_ignore_loss"][subtask].to(device) # Forward input_dict = { "input_ids": batch["input_ids"].to(device), "entity_start_positions": batch["entity_start_positions"].to(device), "entity_end_positions": batch["entity_end_positions"].to(device), "labels": batch["gold_labels"], "label_weight": batch["label_ignore_loss"] } input_ids = batch["input_ids"] entity_start_positions = batch["entity_start_positions"] gold_labels = batch["gold_labels"] batch_data = batch["batch_data"] loss, logits = model(**input_dict) # Accumulate loss total_train_loss += loss.item() # Backward: compute gradients loss.backward() if (step + 1) % accumulation_steps == 0: # Calculate elapsed time in minutes and print loss on the tqdm bar elapsed = format_time(time.time() - start_time) avg_train_loss = total_train_loss / (step + 1) # keep track of changing avg_train_loss train_loss_trajectory.append(avg_train_loss) if (step + 1) % (accumulation_steps * 20) == 0: print( f"Epoch:{epoch+1}|Batch:{step}/{len(train_dataloader)}|Time:{elapsed}|Avg. Loss:{avg_train_loss:.4f}|Loss:{loss.item():.4f}" ) torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step() # Clean the model's previous gradients model.zero_grad() scheduler.step() # Calculate the average loss over all of the batches. avg_train_loss = total_train_loss / len(train_dataloader) # Perform validation with the model and log the performance print("\n") logging.info("Running Validation...") # Put the model in evaluation mode--the dropout layers behave differently during evaluation. 
            model.eval()
            dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset(
                dev_dataloader, model, device, args.task + "_dev", True)

            wandb_log_dict = {"Train Loss": avg_train_loss}
            print("Dev Set:")
            collect_TP_FP_FN = {"TP": 0, "FP": 0, "FN": 0}
            for subtask in model.subtasks:
                dev_subtask_data = dev_subtasks_data[subtask]
                dev_subtask_prediction_scores = dev_prediction_scores[subtask]
                dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN(
                    dev_subtask_data, dev_subtask_prediction_scores, task=subtask)
                if subtask not in IGNORE_TASKS:
                    collect_TP_FP_FN["TP"] += dev_TP
                    collect_TP_FP_FN["FP"] += dev_FP
                    collect_TP_FP_FN["FN"] += dev_FN
                else:
                    print("IGNORE: ", end="")
                print(f"Subtask:{subtask:>15}\tN={dev_TP + dev_FN}\tF1={dev_F1}\tP={dev_P}\tR={dev_R}\tTP={dev_TP}\tFP={dev_FP}\tFN={dev_FN}")
                dev_subtasks_validation_statistics[subtask].append(
                    (epoch + 1, step + 1, dev_TP + dev_FN, dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN))

                wandb_log_dict["Dev_ " + subtask + "_F1"] = dev_F1
                wandb_log_dict["Dev_ " + subtask + "_P"] = dev_P
                wandb_log_dict["Dev_ " + subtask + "_R"] = dev_R

            # Pool the TP/FP/FN counts of the non-ignored subtasks into overall P/R/F1
            dev_macro_P = collect_TP_FP_FN["TP"] / (collect_TP_FP_FN["TP"] + collect_TP_FP_FN["FP"])
            dev_macro_R = collect_TP_FP_FN["TP"] / (collect_TP_FP_FN["TP"] + collect_TP_FP_FN["FN"])
            dev_macro_F1 = (2 * dev_macro_P * dev_macro_R) / (dev_macro_P + dev_macro_R)
            print(collect_TP_FP_FN)
            print("dev_macro_P:", dev_macro_P, "\ndev_macro_R:", dev_macro_R, "\ndev_macro_F1:", dev_macro_F1, "\n")
            wandb_log_dict["Dev_macro_F1"] = dev_macro_F1
            wandb_log_dict["Dev_macro_P"] = dev_macro_P
            wandb_log_dict["Dev_macro_R"] = dev_macro_R
            if args.wandb:
                wandb.log(wandb_log_dict)

            if dev_macro_F1 > best_dev_F1:
                best_dev_F1 = dev_macro_F1
                print("NEW BEST F1:", best_dev_F1, " Saving checkpoint now.")
                torch.save(model.state_dict(), args.output_dir + "/ckpt.pth")
                # print(model.state_dict().keys())
                # model.save_pretrained(args.save_directory)

            model.train()

            training_time = format_time(time.time() - start_time)
            # Record all statistics from this epoch
            training_stats.append({
                'epoch': epoch + 1,
                'Training Loss': avg_train_loss,
                'Training Time': training_time
            })

            # Save the loss trajectory
            epoch_train_loss.append(train_loss_trajectory)

        print("\n\n")
        logging.info(f"Training complete with total Train time:{format_time(time.time() - total_start_time)}")
        log_list(training_stats)

        # Reload the best checkpoint before evaluation
        model.load_state_dict(torch.load(args.output_dir + "/ckpt.pth"))
        model.eval()

        # Save the model and the tokenizer here:
        # logging.info(f"Saving the model and tokenizer in {args.save_directory}")
        # model.save_pretrained(args.save_directory)
        # Save each subtask classifier's weights to individual state dicts:
        # for subtask, classifier in model.classifiers.items():
        #     classifier_save_file = os.path.join(args.save_directory, f"{subtask}_classifier.bin")
        #     logging.info(f"Saving the model's {subtask} classifier weights at {classifier_save_file}")
        #     torch.save(classifier.state_dict(), classifier_save_file)
        # tokenizer.save_pretrained(args.save_directory)

        # Plot the train loss trajectory:
        # train_loss_trajectory_plot_file = os.path.join(args.output_dir, "train_loss_trajectory.png")
        # logging.info(f"Saving the Train loss trajectory at {train_loss_trajectory_plot_file}")
        # print(epoch_train_loss)

        # TODO: Plot the validation performance
        # TODO: Save dev_subtasks_validation_statistics
    else:
        logging.info("No training needed. Directly going to evaluation!")

    # Save the model name in the model_config file
    model_config["model"] = "MultiTaskBertForCovidEntityClassification"
    model_config["epochs"] = args.n_epochs

    # Find the best threshold for each subtask based on dev set performance
    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    # test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset(
    #     test_dataloader, model, device, args.task, True)
    dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset(
        dev_dataloader, model, device, args.task + "_dev", True)

    best_test_thresholds = {subtask: 0.5 for subtask in model.subtasks}
    best_dev_thresholds = {subtask: 0.5 for subtask in model.subtasks}
    best_test_F1s = {subtask: 0.0 for subtask in model.subtasks}
    best_dev_F1s = {subtask: 0.0 for subtask in model.subtasks}
    # test_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks}
    dev_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks}

    for subtask in model.subtasks:
        dev_subtask_data = dev_subtasks_data[subtask]
        dev_subtask_prediction_scores = dev_prediction_scores[subtask]
        for t in thresholds:
            dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN(
                dev_subtask_data, dev_subtask_prediction_scores, THRESHOLD=t, task=subtask)
            dev_subtasks_t_F1_P_Rs[subtask].append(
                (t, dev_F1, dev_P, dev_R, dev_TP + dev_FN, dev_TP, dev_FP, dev_FN))
            if dev_F1 > best_dev_F1s[subtask]:
                best_dev_thresholds[subtask] = t
                best_dev_F1s[subtask] = dev_F1

        logging.info(f"Subtask:{subtask:>15}")
        log_list(dev_subtasks_t_F1_P_Rs[subtask])
        logging.info(f"Best Dev Threshold for subtask: {best_dev_thresholds[subtask]}\t Best dev F1: {best_dev_F1s[subtask]}")

    # Save the best dev thresholds and dev F1s in the results dict
    results["best_dev_threshold"] = best_dev_thresholds
    results["best_dev_F1s"] = best_dev_F1s
    results["dev_t_F1_P_Rs"] = dev_subtasks_t_F1_P_Rs

    # Evaluate on Test
    logging.info("Testing on eval dataset")
    predicted_labels, prediction_scores, gold_labels = make_predictions_on_dataset(
        dev_dataloader, model, device, args.task)

    # Test
    for subtask in model.subtasks:
        logging.info(f"\nTesting the trained classifier on subtask: {subtask}")
        results[subtask] = dict()
        cm = metrics.confusion_matrix(gold_labels[subtask], predicted_labels[subtask])
        classification_report = metrics.classification_report(
            gold_labels[subtask], predicted_labels[subtask], output_dict=True)
        logging.info(cm)
        logging.info(metrics.classification_report(gold_labels[subtask], predicted_labels[subtask]))
        results[subtask]["CM"] = cm.tolist()  # store as a list of lists instead of a numpy.ndarray
        results[subtask]["Classification Report"] = classification_report

        # SQuAD-style EM and F1 evaluation, for all test cases and for positive
        # test cases only (i.e. cases where annotators had a gold annotation)
        EM_score, F1_score, total = get_raw_scores(dev_subtasks_data[subtask], prediction_scores[subtask])
        logging.info("Word overlap based SQuAD evaluation style metrics:")
        logging.info(f"Total number of cases: {total}")
        logging.info(f"EM_score: {EM_score}")
        logging.info(f"F1_score: {F1_score}")
        results[subtask]["SQuAD_EM"] = EM_score
        results[subtask]["SQuAD_F1"] = F1_score
        results[subtask]["SQuAD_total"] = total

        pos_EM_score, pos_F1_score, pos_total = get_raw_scores(
            dev_subtasks_data[subtask], prediction_scores[subtask], positive_only=True)
        logging.info(f"Total number of Positive cases: {pos_total}")
        logging.info(f"Pos. EM_score: {pos_EM_score}")
        logging.info(f"Pos. F1_score: {pos_F1_score}")
        results[subtask]["SQuAD_Pos. EM"] = pos_EM_score
        results[subtask]["SQuAD_Pos. F1"] = pos_F1_score
        results[subtask]["SQuAD_Pos. EM_F1_total"] = pos_total

        # New evaluation suggested by Alan
        F1, P, R, TP, FP, FN = get_TP_FP_FN(
            dev_subtasks_data[subtask], prediction_scores[subtask],
            THRESHOLD=best_dev_thresholds[subtask], task=subtask)
        logging.info("New evaluation scores:")
        logging.info(f"F1: {F1}")
        logging.info(f"Precision: {P}")
        logging.info(f"Recall: {R}")
        logging.info(f"True Positive: {TP}")
        logging.info(f"False Positive: {FP}")
        logging.info(f"False Negative: {FN}")
        results[subtask]["F1"] = F1
        results[subtask]["P"] = P
        results[subtask]["R"] = R
        results[subtask]["TP"] = TP
        results[subtask]["FP"] = FP
        results[subtask]["FN"] = FN
        N = TP + FN
        results[subtask]["N"] = N

    # Save model_config and results
    model_config_file = os.path.join(args.output_dir, "model_config.json")
    results_file = os.path.join(args.output_dir, "results.json")
    logging.info(f"Saving model config at {model_config_file}")
    save_in_json(model_config, model_config_file)
    logging.info(f"Saving results at {results_file}")
    save_in_json(results, results_file)
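The overall dev scores above are obtained by pooling TP/FP/FN counts across the non-ignored subtasks and only then computing precision, recall, and F1; although the variables are named "macro", pooling raw counts this way is effectively micro-averaging. A minimal self-contained sketch of that computation (the helper name pooled_prf is hypothetical, not part of the snippet):

def pooled_prf(counts):
    # counts: {"TP": int, "FP": int, "FN": int}, summed over subtasks
    p = counts["TP"] / (counts["TP"] + counts["FP"])  # precision
    r = counts["TP"] / (counts["TP"] + counts["FN"])  # recall
    f1 = 2 * p * r / (p + r)                          # harmonic mean of P and R
    return p, r, f1

print(pooled_prf({"TP": 80, "FP": 20, "FN": 40}))  # (0.8, 0.666..., 0.727...)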
            ttag, ttype = m.groups()
            if len(ttype) > 0 and ttype[0] == "-":
                ttype = ttype[1:]
            current.append((token, ttag, ttype))

    # Process leftovers, if any
    if len(current) > 0:
        sentences.append(current)
    # print(sentences)
    if len(sentences) > 0:
        output(fn, output_directory, sentences)


if __name__ == "__main__":
    conll_format_test_files = "Conll_Outputs/"
    list_of_test_files = utils.Read_Files_in_Input_Folder(conll_format_test_files)
    standoff_output_directory = "Standoff_Outputs/"
    utils.make_dir_if_not_exists(standoff_output_directory)
    for file_name in list_of_test_files:
        file_values = file_name.split("/")
        protocol_name = file_values[-1]
        output_directory = standoff_output_directory
        process(file_name, output_directory)
        # for line in open(file):
        #     print(line)
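Every snippet in this collection calls a make_dir_if_not_exists helper whose body is not shown. A minimal sketch consistent with how it is used (a single path argument, return value ignored) could be:

import os

def make_dir_if_not_exists(path):
    # Create the directory (and any missing parents); exist_ok=True makes the
    # call a no-op when the directory is already there, avoiding a check-then-
    # create race with concurrent processes.
    os.makedirs(path, exist_ok=True)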
        dfs.append(indicator_data)

        stock_indicators_joined = reduce(
            lambda left, right: pd.merge(
                left, right, left_index=True, right_index=True, how='outer'),
            dfs)
        stock_indicators_joined.index.name = 'date'
        # print(stock_indicators_joined)
        print('fetched and joined data for ' + stock)

        formatted_output_path = utils.format_path(output_path)
        utils.make_dir_if_not_exists(output_path)
        stock_indicators_joined.to_csv(formatted_output_path + '/' + stock + '.csv')
        print('saved csv file to ' + formatted_output_path + '/' + stock + '.csv')

        elapsed = time.time() - start
        print('time elapsed: ' + str(round(elapsed, 2)) + " seconds")


if __name__ == '__main__':
    fetch(str(sys.argv[1]), str(sys.argv[2]), str(sys.argv[3]))
print "train with less g loss bot" # Generator loss pushing generated x's toward boundary g_loss_bot = 1.0 * ((d_out_bot - boundary_labels)**2).mean() gen_bot.zero_grad() d_bot.zero_grad() g_loss_bot.backward(retain_graph=True) # Only update generator 10% of time # But still backprop every time? if random.uniform(0, 1) < 0.1: gen_bot_optimizer.step() #print d_out_bot make_dir_if_not_exists(EXP_DIR) # Log z norms z_bot_norm = map(torch_to_norm, z_bot_lst) z_bot_norms.append(max(z_bot_norm)) z_top_norm = torch_to_norm(z_top) z_top_norms.append(z_top_norm) d = {'z_bot_norms': z_bot_norms, 'z_top_norms': z_top_norms} with open(os.path.join(EXP_DIR, 'z_norms.pkl'), 'wb') as f: pickle.dump(d, f) fake_images = torch.cat(gen_x_lst, 1) fake_images = fake_images.view(fake_images.size(0), 1, 28, 28) save_image(denorm(fake_images.data), os.path.join(EXP_DIR, 'fake_images%03d.png' % epoch))
def main():
    args = parse_args()
    upsampling_factor = args.upsampling_factor
    num_tiles_w = args.num_tiles_w
    num_tiles_h = args.num_tiles_h
    data_path = args.data_path
    if data_path[-1] == '/':
        data_path = data_path[:-1]

    # Create output path
    out_path = "{}_x{}".format(data_path, upsampling_factor)
    utils.make_dir_if_not_exists(out_path)

    # List of all tiles
    data_path = args.data_path
    img_list = utils.get_list_files(data_path, ext=args.img_ext)

    # Load the mean values
    mean_values_path = os.path.join(
        args.mean_values_path,
        "x{}".format(upsampling_factor),
        "mean_values.pickle")
    with open(mean_values_path, "rb") as fid:
        mean_values = pickle.load(fid)
    mean_val_gt = mean_values["gt"]
    mean_val_down = mean_values["down"]
    mean_val_bic_up = mean_values["bic_up"]

    """ BUILD THE MODEL """
    # Create placeholders
    model_ph = create_placeholders(args.batch_size, args.nchan)

    # Graph model
    params_superres_net = {
        "num_FB_layers": args.num_FB_layers,
        "num_dist_blocks": args.num_Dist_blocks,
        "upsampling_factor": upsampling_factor
    }
    input_superres_net = {
        "tf_init": model_ph["init"],
        "tf_upsampled": model_ph["upsampled"],
    }
    tf_output_text, _, _ = build_model(params_superres_net, input_superres_net)

    # Create saver for the model
    model_saver = tf.train.Saver(save_relative_paths=True)

    """ SUPER RESOLVE ALL IMAGES """
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Restore the model
        restore_model(args.pretrained_path, upsampling_factor, model_saver, sess)

        # Loop over all images
        for img_id, img_name in enumerate(img_list):
            print("Processing image {}".format(img_name))

            # Debug
            if args.debug:
                debug_tile_path = os.path.join(out_path, "{}_Tiles".format(img_name[:-4]))
                utils.make_dir_if_not_exists(debug_tile_path)

            # Open image
            img_path = os.path.join(data_path, img_name)
            full_img = cv2.imread(img_path)
            height_full_lr, width_full_lr, _ = full_img.shape

            # Create tiles
            res_tiles_w = width_full_lr // num_tiles_w + 1
            res_tiles_h = height_full_lr // num_tiles_h + 1

            # Reconstructing
            super_resolved_img = []
            bic_up_img = []
            for tile_w_id in range(num_tiles_w):
                tile_w_start = tile_w_id * res_tiles_w
                tile_w_end = min(tile_w_start + res_tiles_w, width_full_lr)
                super_resolved_col = []
                bic_up_col = []
                for tile_h_id in range(num_tiles_h):
                    tile_h_start = tile_h_id * res_tiles_h
                    tile_h_end = min(tile_h_start + res_tiles_h, height_full_lr)
                    print(" ---> Tile ({:2d}, {:2d})".format(tile_w_id, tile_h_id))

                    # Extract tile image
                    tile_img = full_img[tile_h_start:tile_h_end, tile_w_start:tile_w_end, :]
                    height_tile_lr, width_tile_lr, _ = tile_img.shape

                    # Upsample image - BICUBIC
                    height_tile_hr = upsampling_factor * height_tile_lr
                    width_tile_hr = upsampling_factor * width_tile_lr
                    tile_img_bic_up = cv2.resize(
                        tile_img, (0, 0),
                        fx=upsampling_factor, fy=upsampling_factor,
                        interpolation=cv2.INTER_CUBIC)
                    bic_up_col.append(tile_img_bic_up)

                    # Convert low res to YCbCr
                    tile_img_ycbcr = cv2.cvtColor(tile_img, cv2.COLOR_BGR2YCR_CB)
                    tile_img_y = tile_img_ycbcr[:, :, 0]

                    # Convert high res to YCbCr
                    tile_img_bic_up_ycbcr = cv2.cvtColor(tile_img_bic_up, cv2.COLOR_BGR2YCR_CB)
                    tile_img_bic_up_y = tile_img_bic_up_ycbcr[:, :, 0]
                    tile_img_bic_up_cbcr = tile_img_bic_up_ycbcr[:, :, 1:]

                    # Normalize and center the y channel
                    tile_img_y = normalize_center(tile_img_y, mean_val_down)
                    tile_img_bic_up_y = normalize_center(tile_img_bic_up_y, mean_val_bic_up)

                    # Create the feed dictionary
                    feed_dict = create_feed_dict(tile_img_y, tile_img_bic_up_y, model_ph)

                    # Run the model
                    result = sess.run(tf_output_text, feed_dict=feed_dict)

                    # Unnormalize the results
                    result += mean_val_gt
                    result = np.maximum(0.0, np.minimum(result, 1.0))
                    result *= 255.0

                    # Colorize image
                    result_colored = np.concatenate([result[0, ::], tile_img_bic_up_cbcr], axis=-1)
                    super_resolved_col.append(result_colored)

                    # Save image
                    if args.debug:
                        tile_img_hr = result_colored.copy()
                        tile_img_hr = cv2.cvtColor(tile_img_hr.astype(np.uint8), cv2.COLOR_YCR_CB2BGR)
                        tile_img_name = os.path.join(
                            debug_tile_path,
                            "{}_{:02d}_{:02d}.tif".format(img_name[:-4], tile_w_id, tile_h_id))
                        cv2.imwrite(tile_img_name, tile_img_hr)
                        tile_img_bic_up_name = os.path.join(
                            debug_tile_path,
                            "{}_{:02d}_{:02d}_bic_up.tif".format(img_name[:-4], tile_w_id, tile_h_id))
                        cv2.imwrite(tile_img_bic_up_name, tile_img_bic_up)

                super_resolved_img.append(np.concatenate(super_resolved_col, axis=0))
                bic_up_img.append(np.concatenate(bic_up_col, axis=0))

            full_img_hr = np.concatenate(super_resolved_img, axis=1)
            full_img_hr = cv2.cvtColor(full_img_hr.astype(np.uint8), cv2.COLOR_YCR_CB2BGR)
            full_img_name = os.path.join(out_path, img_name)
            cv2.imwrite(full_img_name, full_img_hr)

            full_img_bicup_hr = np.concatenate(bic_up_img, axis=1)
            full_img_bicup_name = os.path.join(out_path, "{}_bic_up.tif".format(img_name[:-4]))
            cv2.imwrite(full_img_bicup_name, full_img_bicup_hr)
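The tiling logic above cuts the input into a num_tiles_w x num_tiles_h grid, processes each tile, stacks the tiles of one column with axis=0, and stitches the columns with axis=1. A numpy-only sketch of that split-and-stitch round trip, with identity "processing" and made-up sizes, shows the pattern is lossless:

import numpy as np

img = np.arange(7 * 5 * 3, dtype=np.uint8).reshape(7, 5, 3)  # H=7, W=5
num_tiles_w, num_tiles_h = 2, 3
res_w = img.shape[1] // num_tiles_w + 1
res_h = img.shape[0] // num_tiles_h + 1

cols = []
for wi in range(num_tiles_w):
    w0, w1 = wi * res_w, min((wi + 1) * res_w, img.shape[1])
    col = []
    for hi in range(num_tiles_h):
        h0, h1 = hi * res_h, min((hi + 1) * res_h, img.shape[0])
        col.append(img[h0:h1, w0:w1, :])      # "process" each tile here
    cols.append(np.concatenate(col, axis=0))  # stack tiles of a column vertically
stitched = np.concatenate(cols, axis=1)       # stitch columns horizontally

assert np.array_equal(stitched, img)          # round trip reproduces the image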
        fout.write("\n")
    fout.write("\n\n\n")
    fout.close()


if __name__ == '__main__':
    # ---------------- preprocessing data -------------------
    conll_folder = "Conll_Format_Data/"
    utils.make_dir_if_not_exists(conll_folder)

    input_standoff_folder_train = parameters["train_data"]
    conll_folder_train = "Conll_Format_Data/train/"
    utils.make_dir_if_not_exists(conll_folder_train)
    conll_file_train = 'Conll_Format_Data/train_conll.txt'

    input_standoff_folder_test = parameters["test_data"]
    conll_folder_test = "Conll_Format_Data/test/"
    utils.make_dir_if_not_exists(conll_folder_test)
    conll_file_test = 'Conll_Format_Data/test_conll.txt'
def fetch(symbols_file, indicators_file, output_path):
    '''fetches stock data combined with technical indicators, output as csv'''
    # Read from the symbols file
    stocks = []
    with open(utils.format_path(symbols_file), 'r') as data:
        read_data = data.read()
        stocks = str(read_data).split()

    # Read from the indicators file
    indicators = []
    with open(utils.format_path(indicators_file), 'r') as data:
        read_data = data.read()
        indicators = str(read_data).split()

    stocks_config = {
        'function': constants.TIME_SERIES_DAILY_ADJUSTED,
        'output_size': constants.OUTPUTSIZE_FULL,
        'data_type': constants.DATATYPE_JSON,
        'api_key': constants.API_KEY
    }

    indicators_config = {
        'interval': constants.INTERVAL,
        'time_period': constants.TIME_PERIOD,
        'series_type': constants.SERIES_TYPE,
        'api_key': constants.API_KEY
    }

    for stock in stocks:
        start = time.time()
        stock_data = fetch_stock.fetch(stock, stocks_config)
        time.sleep(1)

        dfs = [stock_data]
        for indicator in indicators:
            indicator_data = fetch_indicators.fetch(indicator, stock, indicators_config)
            time.sleep(1)
            dfs.append(indicator_data)

        stock_indicators_joined = reduce(
            lambda left, right: pd.merge(
                left, right, left_index=True, right_index=True, how='outer'),
            dfs)
        stock_indicators_joined.index.name = 'date'
        # print(stock_indicators_joined)
        print('fetched and joined data for ' + stock)

        formatted_output_path = utils.format_path(output_path)
        utils.make_dir_if_not_exists(output_path)
        stock_indicators_joined.to_csv(formatted_output_path + '/' + stock + '.csv')
        print('saved csv file to ' + formatted_output_path + '/' + stock + '.csv')

        elapsed = time.time() - start
        print('time elapsed: ' + str(round(elapsed, 2)) + " seconds")
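The reduce(lambda ..., pd.merge(...)) call above outer-joins every indicator frame onto the price frame by their shared date index, keeping dates that appear in any input. A small self-contained illustration of the same pattern on toy frames (not the real API payloads):

from functools import reduce
import pandas as pd

a = pd.DataFrame({'close': [10.0, 11.0]},
                 index=pd.Index(['2020-01-01', '2020-01-02'], name='date'))
b = pd.DataFrame({'sma': [10.5]}, index=pd.Index(['2020-01-02'], name='date'))
c = pd.DataFrame({'rsi': [55.0]}, index=pd.Index(['2020-01-01'], name='date'))

joined = reduce(
    lambda left, right: pd.merge(left, right, left_index=True, right_index=True, how='outer'),
    [a, b, c])
print(joined)
# Dates missing from an input frame come through as NaN in its columns.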
def graph_(args):
    log = logging.getLogger()

    data_list = []
    for file_name in utils.get_data_file_list(args.data_dir):
        data = utils.load_json(os.path.join(args.data_dir, file_name))
        if data['type'] != 'simulation':
            continue
        if data['channel'] == args.channel:
            data_list.append((file_name, data))

    def filter_data(expr, comp=None):
        ll = []
        for name, item in data_list:
            # print('filter:', name)
            if expr(item):
                log.info('Match: %s' % name)
                ll.append(item)
        if comp is not None:
            ll.sort(key=comp)
        return ll

    def extra_filter(it):
        if 'max_iter' in it.keys() and args.max_iter is not None:
            return int(it['max_iter']) == args.max_iter
        # elif 'eps' in it.keys() and args.eps is not None:
        #     return float(it['eps']) == args.eps
        # elif 'mu' in it.keys() and args.mu is not None:
        #     return float(it['mu']) == args.mu
        else:
            return True

    def get_first(ll, rsn):
        if len(ll) == 0:
            log.error('No matching data found for: %s.' % rsn)
            exit()
        else:
            return ll[0]

    prefix_code = lambda ar_: ar_.get('prefix', '') + ar_.get('code', '')
    prefix_or_code = lambda it, ar_: (
        it.get('code', '') == args.code or
        it.get('prefix', '') == args.code or
        it.get('code', '') == args.extra or
        it.get('prefix', '') == args.extra)

    if args.type == 'single':
        chk = lambda it: it.get('code', '') == args.code and \
            it.get('decoder', '') == args.decoder[0] and \
            extra_filter(it)
        data = get_first(filter_data(chk), 'single')
        plot_(data[args.error], 'k-', data['decoder'])
        title = def_title(args)
        err_plt(args)
    elif args.type == 'compare':
        chk = lambda it: prefix_or_code(it, args) and \
            it.get('decoder', '') == args.decoder[0] and \
            extra_filter(it)
        for data, style in zip(filter_data(chk), line_styles4):
            plot_(data[args.error], style, prefix_code(data))
        title = args.channel.upper() + ', %s decoder' % args.decoder[0]
        err_plt(args)
    elif args.type == 'comp_dec':
        chk = lambda it: prefix_or_code(it, args) and \
            it.get('decoder', '') in args.decoder and \
            extra_filter(it)
        filtered = filter_data(chk)
        # Check whether all matches are for the same code
        same_code = len(set([prefix_code(data) for data in filtered])) <= 1
        for data, style in zip(filtered, line_styles4):
            decoder = data['decoder']
            # leg = '%s-%s%s' % (decoder, prefix_code(data), '-' + str(data.get('layers')))
            leg = decoder if same_code else '%s-%s' % (decoder, prefix_code(data))
            plot_(data[args.error], style, leg)
        title = def_title(args) if same_code else 'Comparison of decoders'
        err_plt(args)
    elif args.type == 'ensemble':
        chk = lambda it: it.get('decoder', '') == args.decoder[0] and \
            re.compile('^' + args.code + '_[0-9]+$').match(it.get('code', ''))
        log.info('Matching ensemble codes')
        for data in filter_data(chk):
            plot_(data[args.error], 'r--', None)
        chk_avg = lambda it: 'sources' in it.keys() and \
            it.get('prefix', '') == args.code and \
            it.get('decoder', '') == args.decoder[0]
        log.info('Searching for average')
        plot_(get_first(filter_data(chk_avg), 'average')[args.error], 'b-', 'Average')
        title = def_title(args) + ' code ensemble' + ', %s decoder' % args.decoder[0]
        err_plt(args)
    elif args.type == 'max_iter':
        chk = lambda it: it.get('code', '') == args.code and \
            it.get('decoder', '') == args.decoder[0] and \
            'max_iter' in it.keys()
        for data, style in zip(filter_data(chk, lambda it: int(it['max_iter'])), line_styles):
            decoder = data['decoder']
            plot_(data[args.error], style, data['max_iter'])
        title = def_title(args) + ', %s decoder' % args.decoder[0] + ', Effect of iterations cap'
        err_plt(args)
    elif args.type == 'hist_iter':
        chk = lambda it: it.get('code', '') == args.code and \
            it.get('decoder', '') == args.decoder[0]
        data = get_first(filter_data(chk), 'single')
        # plot_(data[args.error], 'k-', data['decoder'])
        series = data['dec'][str(args.param)]['iter']
        xvals = range(len(series))
        avg = sum([a1_ * a2_ for a1_, a2_ in zip(xvals, series)]) / sum(series)
        plt.bar(xvals, series, label='Average=%g' % avg)
        plt.xlabel('Number of iterations')
        plt.gca().set_yticks([])
        title = ''
    elif args.type == 'avg_iter':
        chk = lambda it: it.get('code', '') == args.code and \
            it.get('decoder', '') in args.decoder
        for data in filter_data(chk):
            # plot_(data[args.error], 'k-', data['decoder'])
            params = sorted([param for param in data['dec'].keys()])
            avgs = [data['dec'][param]['average'] for param in params]
            plt.plot(params, avgs, label=data['decoder'])
        plt.xlabel(x_labels[args.channel])
        plt.ylabel('Average number of iterations')
        plt.grid(True, which='both')
        title = ''
    else:
        return

    plt.legend(loc='best')
    if args.xlim is not None:
        plt.xlim(args.xlim)
    if args.ylim is not None:
        plt.ylim(args.ylim)
    plt.title(title)
    plt.margins(0)  # autoscale(tight=True)
    if args.save is not None:
        utils.make_dir_if_not_exists(args.plots_dir)
        img_path = os.path.join(args.plots_dir, args.save)
        plt.savefig(img_path, bbox_inches='tight')
    if not args.silent:
        plt.show()
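In the hist_iter branch above, the label's average is a frequency-weighted mean of the iteration counts: each x value is weighted by how many runs took that many iterations. A tiny worked example with made-up counts:

series = [0, 5, 3, 2]  # e.g. 5 runs took 1 iteration, 3 took 2, 2 took 3
xvals = range(len(series))
avg = sum(x * c for x, c in zip(xvals, series)) / sum(series)
print(avg)  # (0*0 + 1*5 + 2*3 + 3*2) / 10 = 1.7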