예제 #1
0
    def _sort_out_coupling_config(cls, cfg: utils.ConfigReader,
                                  purpose: str) -> utils.ConfigReader:
        """
        This method ensures that the SpacePair object initializes its couplings
        with complete parameter settings. That is, it fills the gaps in the
        config with default values from default.py
        :param cfg: (possibly partial) configuration to be completed
        :param purpose: should be either "translation" or "projection";
            "custom" builds a config at runtime from defaults + cfg overrides
        :return: a ConfigReader holding a complete coupling configuration
        """

        c = {}  # this will be turned into a ConfigReader object

        # Make a config during runtime
        if purpose == "custom":
            # assign the specified values, and default values to non-specified parameters
            for k, default_value in default.COUPLING_CONFIG.items():
                c[k] = cfg.params.get(k, default_value)
            return utils.ConfigReader("custom_config.cfg", param_dict=c)

        # Load a config from a file
        file_for_loading = cfg(purpose + "_coupling_pretrained_reldir")
        c["pretrained_loc"] = file_for_loading

        out_reldir = cfg.params.get(
            purpose + "_coupling_save_reldir",
            "outputs/" + purpose + "_coupling_default/")
        c["out_absdir"] = cfg("root_abspath") + out_reldir

        # if the spacepair config doesn't give a size, try the coupling config instead
        c["size"] = cfg.params.get(purpose + "_coupling_size", None)
        if c["size"] is None:
            # BUGFIX: the original re-checked the very same cfg entry here,
            # which is always None at this point; only the pretrained-dir
            # check is meaningful.
            if cfg.params.get(purpose + "_coupling_pretrained_reldir",
                              None) is not None:
                pretrained_cfg = utils.ConfigReader(
                    cfg.params.get(purpose + "_coupling_config_relpath"))
                c["size"] = pretrained_cfg("size")

        if purpose == "projection":
            # prefer an explicit matrix size; fall back to the coupling size
            if "projection_matrix_size" in cfg.params:
                c["max_anchors"] = cfg("projection_matrix_size")
            else:
                c["max_anchors"] = cfg("projection_coupling_size")

        params_relpath = cfg(purpose + "_coupling_config_relpath")
        # make a default config because the path parameter is not specified
        if not params_relpath:
            print(
                f"WARNING: parameter {purpose}_coupling_config_relpath not found. "
                f"Continuing with default parameters for this aligner.")
            # BUGFIX: copy the defaults instead of handing out a reference to
            # the shared default.COUPLING_CONFIG dict (which callers could
            # otherwise mutate).
            c = dict(default.COUPLING_CONFIG)
            return utils.ConfigReader(f"{purpose}_coupling_config.cfg", c)
        # load the config from the file specified in the SpacePair config
        else:
            tmp_cfg = utils.ConfigReader(cfg("root_abspath") + params_relpath)
            tmp_cfg.params.update(c)
            return tmp_cfg
예제 #2
0
    def from_config(cls, config_file, init_all=False):
        """
        Alternate constructor: build a SpacePair from a configuration file.
        :param config_file: path to the SpacePair configuration file
        :param init_all: forwarded to the constructor
        """
        # checks for required parameters and builds file paths
        cfg = cls._sort_out_filepaths(utils.ConfigReader(config_file))

        # load both embedding spaces together with their vocabularies
        space_x, voc_x = utils.load_space(cfg("source_space_relpath"))
        space_y, voc_y = utils.load_space(cfg("target_space_relpath"))

        # handles loading from one file or two files
        freq_x, freq_y = cls._sort_out_freq_dists(cfg, voc_x, voc_y)

        # handles input of individual parameters and default parameters
        translation_cfg = cls._sort_out_coupling_config(cfg,
                                                        purpose='translation')
        projection_cfg = cls._sort_out_coupling_config(cfg,
                                                       purpose='projection')

        return cls(space_x, space_y,
                   voc_x, voc_y,
                   freq_x, freq_y,
                   cfg,
                   translation_cfg,
                   projection_cfg,
                   init_all=init_all)
예제 #3
0
    def init_working_parts(self,
                           gwot1_config: utils.ConfigReader = None,
                           gwot2_config: utils.ConfigReader = None):
        """
        Initializes the two GWOT objects of a SpacePair and obtain the set of
        translation pairs as well as the mapping (T and P, respectively).
        If a config is not specified, it uses the default config from default.py
        (via _sort_out_coupling_config with purpose="custom").
        :param gwot1_config: config for the translation aligner (optional)
        :param gwot2_config: config for the projection aligner (optional)
        """
        print("Initializing Gromov-Wasserstein Aligners ...")

        # initialize default configs if none are specified
        # (IDIOM FIX: dropped the f-prefix from strings with no placeholders)
        if gwot1_config is None:
            gwot1_config = self._sort_out_coupling_config(
                utils.ConfigReader("", {}), "custom")
            print("   WARNING: no config provided for aligner 1. "
                  "Continuing with default settings.")
        if gwot2_config is None:
            gwot2_config = self._sort_out_coupling_config(
                utils.ConfigReader("", {}), "custom")
            print("   WARNING: no config provided for aligner 2. "
                  "Continuing with default settings.")

        # initialize aligners and their optimizers
        self.gwot1 = GWOT(gwot1_config,
                          self.voc_x,
                          self.voc_y,
                          x_freq=self.freq_x,
                          y_freq=self.freq_y,
                          size=self.cfg("translation_coupling_size"))
        self.gwot2 = GWOT(gwot2_config,
                          self.voc_x,
                          self.voc_y,
                          x_freq=self.freq_x,
                          y_freq=self.freq_y,
                          size=self.cfg("projection_coupling_size"))

        print(
            "\nTrying to get T and P (translation pairs and projection matrix) ..."
        )
        # T: scored mutual nearest-neighbor translation pairs
        self.T = self.gwot1.sort_out_scored_mutual_nn()
        # P: projection matrix mapping space X onto space Y
        self.P = self.gwot2.sort_out_mapping(self.X, self.Y, self.voc_x,
                                             self.voc_y)
예제 #4
0
    def compile_config(self) -> utils.ConfigReader:
        """
        Collect this object's current settings into a single ConfigReader.
        Optimizer and fit parameters are included with "opt_"/"fit_" prefixes.
        """
        params = {
            "pretrained_loc": self.pretrained_loc,
            "out_absdir": self.out_absdir,
            "score_type": self.score_type,
            "adjust": self.adjust,
            "metric": self.metric,
            "normalize_vecs": self.normalize_vecs,
            "normalize_dists": self.normalize_dists,
            "distribs": self.distribs,
            "share_vocs": self.share_vocs,
            "size": self.size,
        }
        # flatten the two sub-configs into the flat parameter dict
        for prefix, sub_config in (("opt_", self.opt_config),
                                   ("fit_", self.fit_config)):
            for key, value in sub_config.items():
                params[prefix + key] = value

        return utils.ConfigReader("", param_dict=params)
예제 #5
0
def read_results(dir_to_files: str, file_stub: str, exptype: str, with_baseline=True, tuples=False) -> (utils.ConfigReader, DataFrame, DataFrame):
    """
    Read experiment results from a directory containing at least 2 .tsv files
    and one text file.
    :param dir_to_files: directory path.
    :param file_stub: specifies the sub-group of results. The 3 files to be read
     start with this stub. e.g. 'all_technical'
    :param exptype: one of ["dis_tech", "distech", "unsup_mono", "unsup_bi"]
    :param with_baseline: if True, this also loads a clustering_baseline.tsv;
     when that file is found, it is appended as a 4th return value
    :param tuples: if True, this acknowledges the 'centroid' column as containing tuples
    :return: experiment statistics, DataFrame with pair distances, DataFrame with clustering results
    """
    # the "dis_tech" experiments use a different distance-file naming scheme
    if exptype in ["dis_tech", "distech", "dis_tech/", "distech/"]:
        dists_file = dir_to_files + file_stub + "_dists.tsv"
    else:
        dists_file = dir_to_files + file_stub + "_pairdists.tsv"

    stats_file = dir_to_files + file_stub + "_clustering_stats"
    clust_file = dir_to_files + file_stub + "_shift_clusters.tsv"

    stats = utils.ConfigReader(stats_file)
    dists = pandas.read_csv(dists_file, header=0, index_col=0, delimiter="\t")
    clust = pandas.read_csv(clust_file, header=0, index_col=0, delimiter="\t")

    # these columns hold stringified Python literals; parse them back
    dists["src_neighbors"] = dists["src_neighbors"].apply(ast.literal_eval)
    dists["trg_neighbors"] = dists["trg_neighbors"].apply(ast.literal_eval)

    if tuples:
        clust["centroid"] = clust["centroid"].apply(ast.literal_eval)
    clust["direction_label"] = clust["direction_label"].apply(ast.literal_eval)
    clust["cluster_words"] = clust["cluster_words"].apply(ast.literal_eval)

    if with_baseline:
        bl_file = dir_to_files + file_stub + "_clustering_baseline.tsv"
        try:
            baseline = pandas.read_csv(bl_file, header=0, index_col=0, delimiter="\t")
            return stats, dists, clust, baseline
        except FileNotFoundError:
            print("WARNING: unable to find baseline (=random) clusters. "
                  "Maybe deactivate the loading by setting 'with_baseline=False'?")
            return stats, dists, clust
    return stats, dists, clust
        "selection with empty input")
    if exp_in == None:
        if not experiments:
            experiments = ["unsup_mono"]
            break
        else:
            break
    elif exp_in in default.SHIFT_EXPERIMENTS:
        experiments.append(exp_in)
    else:
        print(f"Invalid experiment name '{exp_in}'.")
print(f"Performing the following experiments:\n   {experiments}")

#========== CONFIGURATION SETUP ===================
# start the wall-clock timer for this stage
take_time = utils.Timer()
cfg = utils.ConfigReader(config_relpath)

absdir = cfg("root_abspath")

# record the (command-line supplied) year pair in the config
# NOTE(review): source_year/target_year are defined earlier in this script
cfg.set("source_year", source_year)
cfg.set("target_year", target_year)
yearstring = str(source_year) + "_" + str(target_year)

# per-year-pair output and visualization directories, created if missing
outdir = absdir + "outputs/" + cfg("subproject_dir") + yearstring + "/"
visuals_dir = absdir + "visuals/" + cfg("subproject_dir") + yearstring + "/"
for path in [outdir, visuals_dir]:
    if not os.path.isdir(path):
        os.makedirs(path)
cfg.set(
import GWOT

import time
import os

import numpy as np
import datetime

import matplotlib.pyplot as plt


# start the wall-clock timer for this script
take_time = utils.Timer()

# prompt for the config path, falling back to the packaged default
config_abspath = utils.loop_input(rtype=str, default="config/optimize_couplings.cfg",
                                  msg="Path to the configuration file")
cfg = utils.ConfigReader(config_abspath)


# create output/visualization directories if they don't exist yet
for path in [cfg("out_absdir"), cfg("visuals_absdir")]:
    if not os.path.exists(path):
        os.makedirs(path)

# ensure that no statistics get overwritten and print the header of the stats file
stats_file_abspath = cfg("out_absdir")+"statistics"
if os.path.exists(stats_file_abspath):
    # an older stats file exists: write to a new, timestamp-suffixed file instead
    stats_file_abspath = stats_file_abspath + "_"+str(datetime.datetime.fromtimestamp(time.time()).isoformat())

with open(stats_file_abspath, "w") as f:
    f.write("\t".join("year1 year2 size distribs"
                      "pairs matches mismatches "
                      "mu_matches med_matches "
예제 #8
0
    #=========== PARAMETER INPUT
    # start the wall-clock timer for this run
    take_time = utils.Timer()

    # two positional CLI arguments: config path and model name
    parser = argparse.ArgumentParser()
    parser.add_argument('config_file',
                        metavar='config',
                        type=str,
                        help='configuration file for training')
    parser.add_argument('model_name',
                        metavar='model',
                        type=str,
                        help="name of the model's file")
    args = parser.parse_args()

    #TODO make sure that the config contains all required parameters
    cfg = utils.ConfigReader(args.config_file)

    #TODO change path assignment to fit with the program
    # all per-model files live under <model_abs_dir>/<model_name>/
    model_abs_path = cfg('model_abs_dir') + args.model_name + "/"
    model_filepath = model_abs_path + args.model_name
    losses_abs_path = model_abs_path + "losses"  # contains (batch_size, overall_loss, sup_l., start_l., end_l., type_l.)
    traintime_abs_path = model_abs_path + "times"
    devscores_abs_path = model_abs_path + "devscores"

    # gold data and predictions are dumped side by side for evaluation
    eval_data_dump_dir = cfg("eval_data_dump_dir")
    eval_data_dump_filepath = eval_data_dump_dir + "gold"
    eval_preds_dump_filepath = eval_data_dump_dir + "predictions"
    # check all relevant file paths and directories before starting training
    # make sure that the training data will be found
    for path in [cfg("data_abs_path"), cfg("dev_data_abs_path")]:
예제 #9
0
"""
Some play-around code for putting things together.
"""
#TODO CLEANUP this file? It's like train_dfgn.py, but in old.

import utils
from modules.ParagraphSelector import ParagraphSelector
from modules.EntityGraph import EntityGraph
from modules.Encoder import Encoder

# prompt for the config path, falling back to the packaged default
main_cfg_file = utils.loop_input(rtype="filepath",
                                 default="config/dfgn.cfg",
                                 msg="Enter configuration file")

cfg = utils.ConfigReader(main_cfg_file)
#cfg.get_param_names() #CLEANUP

# load the HotPotQA data and shape it for the paragraph selector
dh = utils.HotPotDataHandler(cfg("HotPotQA_filepath"))
data = dh.data_for_paragraph_selector()

# paragraph selector restored from a pre-trained model file
ps = ParagraphSelector(model_path=cfg("ps_model_file"),
                       tokenizer=cfg("ps_tokenizer"),
                       encoder_model=cfg("ps_encoder_model"))
enc = Encoder()  #TODO fill out this command once Encoder is done!

avg_degrees = []  # for analysis purposes

for datapoint in data:
    query = datapoint[1]
    """ Paragraph Selector """
    context = ps.make_context(datapoint, threshold=cfg("ps_threshold"))