def start_worker(
    experiment_id,
    experiment_named_id,
    named_combination,
    cmd,
    human_cmd,
    experiment_path,
    running_processes,
):
    """Run a single experiment command to completion and summarize its outcome.

    The command's stdout and stderr are written to ``stdlog.out`` and
    ``stdlog.err`` inside ``experiment_path``. While the subprocess runs,
    ``running_processes[experiment_id]`` holds ``(True, device)``; afterwards it
    is set to ``(False, device)``.

    Args:
        experiment_id: key under which the run is tracked in ``running_processes``.
        experiment_named_id: copied unchanged into the returned dict.
        named_combination: mapping whose items form the base of the returned dict.
        cmd: argument list handed to ``subprocess.run``.
        human_cmd: representation of the command used only for the debug log line.
        experiment_path: directory that receives the two log files.
        running_processes: mutable mapping updated with the run state.

    Returns:
        dict: the items of ``named_combination`` plus ``rc`` (subprocess return
        code), ``experiment_id``, ``details`` (result of
        ``get_experiment_result_detailed(experiment_path)``) and
        ``experiment_named_id``.
    """
    # The device is whatever follows "--device" in the command; -1 when absent.
    device = cmd[cmd.index("--device") + 1] if "--device" in cmd else -1

    running_processes[experiment_id] = (True, device)

    log = get_logger()
    log.debug("starting single setup: {}".format(human_cmd))

    stdout_path = os.path.join(experiment_path, "stdlog.out")
    stderr_path = os.path.join(experiment_path, "stdlog.err")
    with open(stdout_path, "w") as out_file, open(stderr_path, "w") as err_file:
        completed_process = subprocess.run(cmd, stdout=out_file, stderr=err_file)

    experiment_details = get_experiment_result_detailed(experiment_path)
    running_processes[experiment_id] = (False, device)

    # Build the summary in the same key order as a single dict display would.
    summary = {**named_combination}
    summary["rc"] = completed_process.returncode
    summary["experiment_id"] = experiment_id
    summary["details"] = experiment_details
    summary["experiment_named_id"] = experiment_named_id
    return summary
def __init__(self, sorted_expected_label_values, polarity_associations, snem_name):
    """Store the label configuration and precompute positive/negative indexes.

    Args:
        sorted_expected_label_values: sequence of label values; must support
            ``.index()`` and contain both the positive and negative label value.
        polarity_associations: mapping with at least the keys ``'positive'``
            and ``'negative'``.
        snem_name: stored unchanged as ``self.snem_name``.

    Raises:
        KeyError: if ``'positive'`` or ``'negative'`` is missing from
            ``polarity_associations``.
        ValueError: if either label value is not found in
            ``sorted_expected_label_values`` (when it is a list).
    """
    self.logger = get_logger()
    self.polarity_associations = polarity_associations

    positive_value = polarity_associations['positive']
    negative_value = polarity_associations['negative']
    self.pos_label_value = positive_value
    self.neg_label_value = negative_value

    self.sorted_expected_label_values = sorted_expected_label_values
    # Positions of the two polarity labels within the sorted label list.
    self.pos_label_index = sorted_expected_label_values.index(positive_value)
    self.neg_label_index = sorted_expected_label_values.index(negative_value)

    self.snem_name = snem_name
def __init__(self, dataset: Dataset, random_seed=None):
    """Compute a shuffled, randomly oversampled list of example indexes.

    Each example's position in ``dataset`` is paired with its ``"polarity"``
    value and passed to ``RandomOverSampler.fit_resample``. The resampled
    positions are stored, shuffled, in ``self.sampled_indexes``.

    Args:
        dataset: iterable of examples that can be indexed with ``"polarity"``.
        random_seed: forwarded as ``random_state`` to ``RandomOverSampler``.
            The final shuffle uses the global ``random`` module and is not
            seeded here.
    """
    # One pass over the dataset; positions are simply 0..n-1.
    labels = [example["polarity"] for example in dataset]
    positions = list(range(len(labels)))

    # The sampler gets a single-column feature matrix holding the positions.
    index_column = np.asarray(positions).reshape((len(positions), 1))
    label_vector = np.asarray(labels).ravel()

    oversampler = RandomOverSampler(random_state=random_seed)
    resampled_indexes, resampled_labels = oversampler.fit_resample(
        index_column, label_vector
    )

    self.sampled_indexes = resampled_indexes.ravel().tolist()
    sampled_labels = resampled_labels.tolist()
    assert len(self.sampled_indexes) == len(sampled_labels)

    # Only the indexes are shuffled; the labels are kept for the log line below.
    random.shuffle(self.sampled_indexes)

    get_logger().info(
        f"oversampled to {len(self.sampled_indexes)} samples. label distribution: {Counter(sampled_labels)}"
    )
def __init__(self, mode: str, max_seq_length: int, max_hop_distance: int = 10):
    """Initialize the parser state and open the per-mode cache.

    Args:
        mode: inserted into the class's cache file path template to pick the
            cache file; also stored as ``self.mode``.
        max_seq_length: stored unchanged as ``self.max_seq_length``.
        max_hop_distance: stored unchanged as ``self.max_hop_distance``.
    """
    self.logger = get_logger()
    self.nlp = self.__get_spacy()

    tag_to_index, index_to_tag = self.__create_association()
    self.tag2ind = tag_to_index
    self.ind2tag = index_to_tag
    self.num_tags = len(tag_to_index.keys())

    self.max_seq_length = max_seq_length
    self.max_hop_distance = max_hop_distance
    self.mode = mode

    # NOTE(review): the shelf opened here is not closed in this method;
    # confirm it is closed elsewhere in the class.
    cache_filepath = DependencyParser.__CACHE_FILEPATH_TEMPLATE.format(mode)
    self.cache = shelve.open(cache_filepath)
    self.logger.info(
        f"loaded cache with {len(self.cache)} entries from {cache_filepath}"
    )
def __init__(self, patience=2, delta=0.01):
    """
    Initialize the early-stopping bookkeeping.

    Args:
        patience (int): How long to wait after last time validation loss improved.
                        Default: 2
        delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                        Default: 0.01
    """
    self.logger = get_logger()

    # Configuration supplied by the caller.
    self.patience = patience
    self.delta = delta

    # Mutable state; nothing has been observed yet.
    self.counter = 0
    self.best_score = None
    self.early_stop = False
    self.flag_has_score_increased_since_last_check = False
def __init__(self, name, basepath_datasets, human, non_human):
    """Load and shuffle the human and non-human example lists.

    Args:
        name: stored unchanged as ``self.name``.
        basepath_datasets: stored unchanged as ``self.basepath_datasets``.
        human: file names of human-created data; each is resolved through
            ``self.get_filepath_by_name``.
        non_human: file names of non-human-created data, resolved the same way.

    Side effects:
        Seeds the global ``random`` module with 1337 before loading and
        shuffling, and logs the number of examples read per group.
    """
    self.basepath_datasets = basepath_datasets
    self.name = name

    self.human_created_filenames = human
    self.non_human_created_filenames = non_human
    self.human_created_filepaths = list(
        map(self.get_filepath_by_name, self.human_created_filenames)
    )
    self.non_human_created_filepaths = list(
        map(self.get_filepath_by_name, self.non_human_created_filenames)
    )

    self.data_types = ["human", "nonhum"]
    self.sets_info = None

    # Fixed seed for the shuffles below.
    self.random_seed = 1337
    random.seed(self.random_seed)

    self.logger = get_logger()

    self.examples_human = self.files_to_dictlst(self.human_created_filepaths)
    self.examples_nonhum = self.files_to_dictlst(self.non_human_created_filepaths)

    shuffle_message = "shuffling example lists with seed {}".format(self.random_seed)
    self.logger.info(shuffle_message)
    for example_list in (self.examples_human, self.examples_nonhum):
        random.shuffle(example_list)

    human_message = "{} examples read created by humans (from: {})".format(
        len(self.examples_human), self.human_created_filepaths
    )
    self.logger.info(human_message)

    nonhum_message = "{} examples read not created by humans (from: {})".format(
        len(self.examples_nonhum), self.non_human_created_filepaths
    )
    self.logger.info(nonhum_message)
import os

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix

from fxlogger import get_logger

logger = get_logger()


def create_save_plotted_confusion_matrix(conf_matrix, expected_labels, basepath):
    """
    Plot a confusion matrix and save the figure as ``stats.png`` in ``basepath``.

    The matrix is plotted via ``plot_confusion_matrix`` with ``normalize=False``;
    the values it returns are not used afterwards. The figure is then written
    with ``plt.savefig`` and the target path is logged at debug level.

    Args:
        conf_matrix: confusion matrix forwarded to ``plot_confusion_matrix``.
        expected_labels: forwarded as the ``classes`` argument.
        basepath: directory in which ``stats.png`` is created.
    """
    ax, title = plot_confusion_matrix(conf_matrix, expected_labels, normalize=False)

    filepath = os.path.join(basepath, 'stats.png')
    plt.savefig(filepath, bbox_inches='tight')
    logger.debug("created confusion matrices in path: {}".format(filepath))


def plot_confusion_matrix(cm, classes, normalize=False, title=None, cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    based on https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
    """
    if not title:
def __init__(self, global_context_max_seqs_per_doc):
    """Create the object with all truncation counters reset to zero.

    Args:
        global_context_max_seqs_per_doc: stored as ``self.max_seqs_per_doc``.
    """
    self.logger = get_logger()
    self.max_seqs_per_doc = global_context_max_seqs_per_doc

    # Counters start at zero; they are not modified in this method.
    self.count_truncated = 0
    self.count_all_sequences_where_we_count_truncation = 0
    self.count_truncated_long_docs = 0
def __init__(self, options):
    """Prepare the experiment controller from the given options.

    Resolves the CUDA device list, selects the hyper-parameter value sets for
    ``options.combi_mode`` / ``options.combi_id``, expands them into the
    cartesian product, reduces that product via ``self._apply_conditions`` and
    finally obtains the dataset preparer for ``options.dataset``.

    Args:
        options: object exposing at least ``cuda_devices``, ``num_workers``,
            ``experiments_path``, ``combi_mode``, ``combi_id`` and ``dataset``.
            ``num_workers`` is overwritten with the CUDA device count when it
            is negative and a device list is available.

    Raises:
        ValueError: if no combination set is defined for the given
            ``combi_mode`` / ``combi_id``.
        AssertionError: if the selected combination set does not have exactly
            one entry per expected argument name.
        Exception: if ``options.dataset`` is not one of the known datasets.
    """
    self.logger = get_logger()
    self.opt = options

    if self.opt.cuda_devices:
        # "SGE_GPU" means: read the device list from the environment variable
        # of the same name (to run on SCC).
        self.cuda_devices = (
            os.environ.get("SGE_GPU")
            if self.opt.cuda_devices == "SGE_GPU"
            else self.opt.cuda_devices
        )

        if self.cuda_devices:
            self.logger.info("cuda devices:" + self.cuda_devices)
            self.cuda_devices = self.cuda_devices.split(",")
            self.logger.info(
                f"was assigned {len(self.cuda_devices)} cuda devices: {self.cuda_devices}"
            )
            if self.opt.num_workers < 0:
                self.logger.info(
                    f"num_workers < 0: using cuda device count. setting num_workers={len(self.cuda_devices)}"
                )
                self.opt.num_workers = len(self.cuda_devices)
    else:
        # do not use CUDA
        self.cuda_devices = None

    self.use_cross_validation = 0  # if 0: do not use cross validation
    self.snem = "f1_macro"
    self.experiment_base_path = self.opt.experiments_path

    args_names_ordered = [
        "model_name",
        "optimizer",
        "initializer",
        "learning_rate",
        "batch_size",
        "balancing",
        "num_epoch",
        "lsr",
        "use_tp_placeholders",
        "spc_lm_representation",
        "spc_input_order",
        "aen_lm_representation",
        "spc_lm_representation_distilbert",
        "finetune_glove",
        "eval_only_after_last_epoch",
        "devmode",
        "local_context_focus",
        "SRD",
        "pretrained_model_name",
        "state_dict",
        "use_global_context",
        "global_context_seqs_per_doc",
        "focus_mode",
    ]

    # Pick the value sets for the requested (mode, id) pair; the module-level
    # constants are only referenced in the branch that is actually taken.
    requested_combi = (self.opt.combi_mode, self.opt.combi_id)
    if requested_combi == ("default", 0):
        combinations = combinations_default_0
    elif requested_combi == ("combinations_g", 0):
        combinations = combinations_g_0
    else:
        combinations = None

    if not combinations:
        raise ValueError(
            "combination(mode={}, id={}) not defined".format(
                self.opt.combi_mode, self.opt.combi_id
            )
        )

    # Conditions handed to self._apply_conditions (defined elsewhere):
    # key: name of a parameter that is only applied if its conditions are met
    # value: list of tuples, each consisting of a parameter name and the value
    #        that parameter needs to have for the condition to be satisfied
    # All tuples in one list are OR connected, so a single satisfied tuple is
    # enough. If AND connected conditions are ever needed, the idea is to add
    # an outer list, giving a list of lists (of tuples) where all lists are AND
    # connected.
    # NOTE(review): the original comment ended mid-sentence ("If a condition is
    # not satisfied, the corresponding parameter will still be pass"); confirm
    # in _apply_conditions how an unmet condition is handled.
    conditions = {
        "spc_lm_representation_distilbert": [("model_name", "distilbert")],
        "spc_lm_representation": [
            ("model_name", "spc_bert"),
            ("model_name", "spc_roberta"),
        ],
        "spc_input_order": [
            ("model_name", "spc_bert"),
            ("model_name", "spc_roberta"),
            ("model_name", "spc_distilbert"),
        ],
        "aen_lm_representation": [
            ("model_name", "aen_bert"),
            ("model_name", "aen_roberta"),
            ("model_name", "aen_distilbert"),
        ],
        "use_early_stopping": [("num_epoch", "10")],
        "finetune_glove": [("model_name", "aen_glove")],
        "local_context_focus": [("model_name", "lcf_bert")],
        "SRD": [("model_name", "lcf_bert")],
        "pretrained_model_name": [
            ("model_name", "lcf_bert"),
            ("model_name", "aen_bert"),
            ("model_name", "spc_bert"),
        ],
    }

    assert len(args_names_ordered) == len(combinations.keys())

    timestamp = datetime.today().strftime("%Y%m%d-%H%M%S")
    self.experiment_base_id = self.opt.dataset + "_" + timestamp

    self.basecmd = ["python", "train.py"]
    self.basepath = "controller_data"
    self.basepath_data = os.path.join(self.basepath, "datasets")

    # Collect the candidate values per argument (in the fixed argument order)
    # and compute how many combinations their cartesian product must have.
    values_per_argument = [
        list(combinations[arg_name]) for arg_name in args_names_ordered
    ]
    combination_count = 1
    for candidate_values in values_per_argument:
        combination_count *= len(candidate_values)

    combinations = list(product(*values_per_argument))
    assert len(combinations) == combination_count

    self.logger.info(
        "{} arguments, totaling in {} combinations".format(
            len(args_names_ordered), combination_count
        )
    )

    # apply conditions
    self.logger.info("applying conditions...")
    self.named_combinations, count_duplicates = self._apply_conditions(
        combinations, args_names_ordered, conditions
    )
    self.logger.info(
        "applied conditions. removed {} combinations. {} -> {}".format(
            count_duplicates, combination_count, len(self.named_combinations)
        )
    )
    self.combination_count = len(self.named_combinations)

    if self.use_cross_validation > 0:
        # NOTE(review): unreachable while use_cross_validation is hard-coded
        # to 0 above; this branch does not set datasetname / task_format.
        self.logger.info(
            "using {}-fold cross validation".format(self.use_cross_validation)
        )
        self.dataset_preparer = DatasetPreparer.poltsanews_crossval8010_allhuman(
            self.basepath_data
        )
    else:
        # The format argument is ignored: the message has no placeholder.
        self.logger.info(
            "not using cross validation".format(self.use_cross_validation)
        )

        # Every known dataset name is also the name of the DatasetPreparer
        # factory that builds it, so the factory is looked up by name.
        known_datasets = (
            "poltsanews_rel801010_allhuman",
            "semeval14restaurants",
            "semeval14laptops",
            "acl14twitter",
            "sentinews",
            "newstsc",
            "newstsc2",
            "newstsc3",
            "newstsc4",
            "newstsc5",
            "newstscg",
        )
        if self.opt.dataset not in known_datasets:
            raise Exception("unknown dataset: {}".format(self.opt.dataset))

        create_preparer = getattr(DatasetPreparer, self.opt.dataset)
        (
            self.dataset_preparer,
            self.datasetname,
            self.task_format,
        ) = create_preparer(self.basepath_data)