def _sort_out_coupling_config(cls, cfg: utils.ConfigReader, purpose: str) -> utils.ConfigReader:
    """
    This method ensures that the SpacePair object initializes its couplings
    with complete parameter settings. That is, it fills the gaps in the
    config with default values from default.py.
    :param cfg: SpacePair config to be completed
    :param purpose: "translation" or "projection" to load/derive a coupling
        config; "custom" to assemble one at runtime from default.py
    :return: a complete coupling config
    """
    c = {}  # this will be turned into a ConfigReader object

    # make a config during runtime
    if purpose == "custom":
        # assign the specified values, and default values to non-specified parameters
        for k, default_value in default.COUPLING_CONFIG.items():
            c[k] = cfg.params.get(k, default_value)
        return utils.ConfigReader("custom_config.cfg", param_dict=c)

    # load a config from a file
    file_for_loading = cfg(purpose + "_coupling_pretrained_reldir")
    c.update({"pretrained_loc": file_for_loading})

    out_reldir = cfg.params.get(purpose + "_coupling_save_reldir",
                                "outputs/" + purpose + "_coupling_default/")
    c.update({"out_absdir": cfg("root_abspath") + out_reldir})

    # if the SpacePair config doesn't give a size, try the pretrained coupling's config instead
    c.update({"size": cfg.params.get(purpose + "_coupling_size", None)})
    if c["size"] is None:
        if cfg.params.get(purpose + "_coupling_pretrained_reldir", None) is not None:
            pretrained_cfg = utils.ConfigReader(
                cfg.params.get(purpose + "_coupling_config_relpath"))
            c.update({"size": pretrained_cfg("size")})

    if purpose == "projection":
        if "projection_matrix_size" in cfg.params:
            c.update({"max_anchors": cfg("projection_matrix_size")})
        else:
            c.update({"max_anchors": cfg("projection_coupling_size")})

    params_relpath = cfg(purpose + "_coupling_config_relpath")
    # make a default config because the path parameter is not specified
    if not params_relpath:
        print(f"WARNING: parameter {purpose}_coupling_config_relpath not found. "
              f"Continuing with default parameters for this aligner.")
        # copy the defaults so that the module-level dict is not mutated,
        # and keep the values collected above instead of discarding them
        defaults = dict(default.COUPLING_CONFIG)
        defaults.update({k: v for k, v in c.items() if v is not None})
        return utils.ConfigReader(f"{purpose}_coupling_config.cfg", param_dict=defaults)
    # load the config from the file specified in the SpacePair config
    else:
        tmp_cfg = utils.ConfigReader(cfg("root_abspath") + params_relpath)
        tmp_cfg.params.update(c)
        return tmp_cfg
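# Usage sketch for the values of `purpose` (a minimal, hypothetical example;
# it assumes this classmethod lives on SpacePair and that the config file
# defines root_abspath and the translation_coupling_* parameters):
#
#     cfg = utils.ConfigReader("config/spacepair.cfg")
#     # complete the translation coupling's settings from the config file
#     t_cfg = SpacePair._sort_out_coupling_config(cfg, purpose="translation")
#     # assemble a config at runtime; parameters missing from cfg fall back
#     # to default.COUPLING_CONFIG
#     c_cfg = SpacePair._sort_out_coupling_config(cfg, purpose="custom")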
def from_config(cls, config_file, init_all=False):
    cfg = utils.ConfigReader(config_file)

    # checks for required parameters and builds file paths
    cfg = cls._sort_out_filepaths(cfg)

    space_x, voc_x = utils.load_space(cfg("source_space_relpath"))
    space_y, voc_y = utils.load_space(cfg("target_space_relpath"))

    # handles loading from one file or two files
    freq_x, freq_y = cls._sort_out_freq_dists(cfg, voc_x, voc_y)

    # handles input of individual parameters and default parameters
    gwot1_config = cls._sort_out_coupling_config(cfg, purpose='translation')
    gwot2_config = cls._sort_out_coupling_config(cfg, purpose='projection')

    return cls(space_x, space_y, voc_x, voc_y, freq_x, freq_y,
               cfg, gwot1_config, gwot2_config, init_all=init_all)
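# Minimal construction sketch (the config path is hypothetical; the file must
# provide the *_relpath parameters that _sort_out_filepaths checks for):
#
#     sp = SpacePair.from_config("config/spacepair.cfg", init_all=False)
#     # spaces, vocabularies, frequency distributions and both coupling
#     # configs are now available on `sp`; with init_all=True, the GWOT
#     # aligners would be initialized right away as well.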
def init_working_parts(self,
                       gwot1_config: utils.ConfigReader = None,
                       gwot2_config: utils.ConfigReader = None):
    """
    Initializes the two GWOT objects of a SpacePair and obtains the set of
    translation pairs as well as the mapping (T and P, respectively).
    If a config is not specified, the default config from default.py is used.
    """
    print("Initializing Gromov-Wasserstein Aligners ...")

    # initialize default configs if none are specified
    if gwot1_config is None:
        gwot1_config = self._sort_out_coupling_config(
            utils.ConfigReader("", {}), "custom")
        print(" WARNING: no config provided for aligner 1. "
              "Continuing with default settings.")
    if gwot2_config is None:
        gwot2_config = self._sort_out_coupling_config(
            utils.ConfigReader("", {}), "custom")
        print(" WARNING: no config provided for aligner 2. "
              "Continuing with default settings.")

    # initialize aligners and their optimizers
    self.gwot1 = GWOT(gwot1_config, self.voc_x, self.voc_y,
                      x_freq=self.freq_x, y_freq=self.freq_y,
                      size=self.cfg("translation_coupling_size"))
    self.gwot2 = GWOT(gwot2_config, self.voc_x, self.voc_y,
                      x_freq=self.freq_x, y_freq=self.freq_y,
                      size=self.cfg("projection_coupling_size"))

    print("\nTrying to get T and P (translation pairs and projection matrix) ...")
    self.T = self.gwot1.sort_out_scored_mutual_nn()
    self.P = self.gwot2.sort_out_mapping(self.X, self.Y, self.voc_x, self.voc_y)
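# Deferred-initialization sketch (continuing the hypothetical example above):
# constructing with init_all=False postpones the expensive GWOT optimization
# until init_working_parts() is called. Omitting both configs triggers the
# "custom" branch of _sort_out_coupling_config, i.e. default settings.
#
#     sp = SpacePair.from_config("config/spacepair.cfg", init_all=False)
#     sp.init_working_parts()      # default coupling configs, with warnings
#     translation_pairs, projection_matrix = sp.T, sp.P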
def compile_config(self) -> utils.ConfigReader:
    """Collect this GWOT object's effective parameters in a ConfigReader."""
    cfg = {"pretrained_loc":  self.pretrained_loc,
           "out_absdir":      self.out_absdir,
           "score_type":      self.score_type,
           "adjust":          self.adjust,
           "metric":          self.metric,
           "normalize_vecs":  self.normalize_vecs,
           "normalize_dists": self.normalize_dists,
           "distribs":        self.distribs,
           "share_vocs":      self.share_vocs,
           "size":            self.size}
    # re-attach the prefixes that distinguish optimizer and fit parameters
    cfg.update({"opt_" + k: v for k, v in self.opt_config.items()})
    cfg.update({"fit_" + k: v for k, v in self.fit_config.items()})
    return utils.ConfigReader("", param_dict=cfg)
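# Round-trip sketch: compile_config() makes a GWOT object's effective settings
# persistable, so an equivalent aligner can be rebuilt later from the returned
# ConfigReader ("opt_lr"/"lr" stand in for whatever keys opt_config holds):
#
#     compiled = gwot1.compile_config()
#     assert compiled("size") == gwot1.size
#     assert compiled("opt_lr") == gwot1.opt_config["lr"]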
def read_results(dir_to_files: str, file_stub: str, exptype: str,
                 with_baseline=True, tuples=False) \
        -> (utils.ConfigReader, DataFrame, DataFrame):
    """
    Read experiment results from a directory containing at least two .tsv
    files and one text file.
    :param dir_to_files: directory path
    :param file_stub: specifies the sub-group of results. The three files to
        be read start with this stub, e.g. 'all_technical'
    :param exptype: one of ["dis_tech", "distech", "unsup_mono", "unsup_bi"]
    :param with_baseline: if True, this also loads a clustering_baseline.tsv
        (returned as a fourth value)
    :param tuples: if True, this acknowledges the 'centroid' column as
        containing tuples
    :return: experiment statistics, DataFrame with pair distances,
        DataFrame with clustering results
    """
    if exptype in ["dis_tech", "distech", "dis_tech/", "distech/"]:
        dists_file = dir_to_files + file_stub + "_dists.tsv"
    else:
        dists_file = dir_to_files + file_stub + "_pairdists.tsv"
    stats_file = dir_to_files + file_stub + "_clustering_stats"
    clust_file = dir_to_files + file_stub + "_shift_clusters.tsv"

    stats = utils.ConfigReader(stats_file)
    dists = pandas.read_csv(dists_file, header=0, index_col=0, delimiter="\t")
    clust = pandas.read_csv(clust_file, header=0, index_col=0, delimiter="\t")

    # parse the string-serialized columns back into Python objects
    dists["src_neighbors"] = [ast.literal_eval(x) for x in dists["src_neighbors"]]
    dists["trg_neighbors"] = [ast.literal_eval(x) for x in dists["trg_neighbors"]]
    if tuples is True:
        clust["centroid"] = [ast.literal_eval(x) for x in clust["centroid"]]
    clust["direction_label"] = [ast.literal_eval(x) for x in clust["direction_label"]]
    clust["cluster_words"] = [ast.literal_eval(x) for x in clust["cluster_words"]]

    if with_baseline is True:
        bl_file = dir_to_files + file_stub + "_clustering_baseline.tsv"
        try:
            baseline = pandas.read_csv(bl_file, header=0, index_col=0, delimiter="\t")
            return stats, dists, clust, baseline
        except FileNotFoundError:
            print(f"WARNING: unable to find baseline (=random) clusters. "
                  f"Maybe deactivate the loading by setting 'with_baseline=False'?")
            return stats, dists, clust
    else:
        return stats, dists, clust
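# Usage sketch (directory and stub are hypothetical). Note one caveat that
# follows from the code above: with with_baseline=True, the function returns
# four values only if the baseline file exists, and falls back to three
# values otherwise, so a fixed 4-way unpacking can raise a ValueError.
#
#     stats, dists, clust, baseline = read_results(
#         "outputs/dis_tech/", "all_technical", exptype="dis_tech",
#         with_baseline=True, tuples=True)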
"selection with empty input") if exp_in == None: if not experiments: experiments = ["unsup_mono"] break else: break elif exp_in in default.SHIFT_EXPERIMENTS: experiments.append(exp_in) else: print(f"Invalid experiment name '{exp_in}'.") print(f"Performing the following experiments:\n {experiments}") #========== CONFIGURATION SETUP =================== take_time = utils.Timer() cfg = utils.ConfigReader(config_relpath) absdir = cfg("root_abspath") cfg.set("source_year", source_year) cfg.set("target_year", target_year) yearstring = str(source_year) + "_" + str(target_year) outdir = absdir + "outputs/" + cfg("subproject_dir") + yearstring + "/" visuals_dir = absdir + "visuals/" + cfg("subproject_dir") + yearstring + "/" for path in [outdir, visuals_dir]: if not os.path.isdir(path): os.makedirs(path) # load spaces cfg.set(
import utils
import GWOT
import time
import os
import numpy as np
import datetime
import matplotlib.pyplot as plt

take_time = utils.Timer()

config_abspath = utils.loop_input(rtype=str, default="config/optimize_couplings.cfg",
                                  msg="Path to the configuration file")
cfg = utils.ConfigReader(config_abspath)

for path in [cfg("out_absdir"), cfg("visuals_absdir")]:
    if not os.path.exists(path):
        os.makedirs(path)

# ensure that no statistics get overwritten and print the header of the stats file
stats_file_abspath = cfg("out_absdir") + "statistics"
if os.path.exists(stats_file_abspath):
    stats_file_abspath = stats_file_abspath + "_" + str(datetime.datetime.fromtimestamp(time.time()).isoformat())

with open(stats_file_abspath, "w") as f:
    f.write("\t".join("year1 year2 size distribs "
                      "pairs matches mismatches "
                      "mu_matches med_matches "
#=========== PARAMETER INPUT
take_time = utils.Timer()

parser = argparse.ArgumentParser()
parser.add_argument('config_file', metavar='config', type=str,
                    help='configuration file for training')
parser.add_argument('model_name', metavar='model', type=str,
                    help="name of the model's file")
args = parser.parse_args()

#TODO make sure that the config contains all required parameters
cfg = utils.ConfigReader(args.config_file)

#TODO change path assignment to fit with the program
model_abs_path = cfg('model_abs_dir') + args.model_name + "/"
model_filepath = model_abs_path + args.model_name

losses_abs_path = model_abs_path + "losses"  # contains (batch_size, overall_loss, sup_l., start_l., end_l., type_l.)
traintime_abs_path = model_abs_path + "times"
devscores_abs_path = model_abs_path + "devscores"

eval_data_dump_dir = cfg("eval_data_dump_dir")
eval_data_dump_filepath = eval_data_dump_dir + "gold"
eval_preds_dump_filepath = eval_data_dump_dir + "predictions"

# check all relevant file paths and directories before starting training
# make sure that the training data will be found
for path in [cfg("data_abs_path"), cfg("dev_data_abs_path")]:
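# Example invocation from the shell (script and file names are hypothetical):
#
#     python train.py config/train.cfg my_model
#
# This sets args.config_file to "config/train.cfg" and args.model_name to
# "my_model", so all model artifacts land under cfg('model_abs_dir') + "my_model/".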
""" Some play-around code for putting things together. """ #TODO CLEANUP this file? It's like train_dfgn.py, but in old. import utils from modules.ParagraphSelector import ParagraphSelector from modules.EntityGraph import EntityGraph from modules.Encoder import Encoder main_cfg_file = utils.loop_input(rtype="filepath", default="config/dfgn.cfg", msg="Enter configuration file") cfg = utils.ConfigReader(main_cfg_file) #cfg.get_param_names() #CLEANUP dh = utils.HotPotDataHandler(cfg("HotPotQA_filepath")) data = dh.data_for_paragraph_selector() ps = ParagraphSelector(model_path=cfg("ps_model_file"), tokenizer=cfg("ps_tokenizer"), encoder_model=cfg("ps_encoder_model")) enc = Encoder() #TODO fill out this command once Encoder is done! avg_degrees = [] # for analysis purposes for datapoint in data: query = datapoint[1] """ Paragraph Selector """ context = ps.make_context(datapoint, threshold=cfg("ps_threshold"))