Example #1
    def __init__(self, arg_str=None):
        OptimizedInterleave.__init__(self, arg_str)
        # parse interleaving-specific options from the argument string
        parser = argparse.ArgumentParser()
        parser.add_argument('--allowed_leavings',
                            choices=['prefix_constraint',
                                     'prefix_constraint_va'],
                            default='prefix_constraint_va')
        parser.add_argument("--credit_va", action="store_true", default=False)
        parser.add_argument("--um_class", type=str,
                            default="environment.FederatedClickModel")
        parser.add_argument("--um_args", type=str,
                            default="0.2 0.1")
        args = vars(parser.parse_known_args(split_arg_str(arg_str))[0])
        self.allowed_leavings = getattr(self, args['allowed_leavings'])
        if args["credit_va"]:
            # vertical-aware credit assignment needs a click model
            self.precompute_rank = self.precompute_rank_va
            self.um_class = get_class(args["um_class"])
            self.um = self.um_class(args["um_args"])
Example #2
         args["output_dir"])
    config_bk = os.path.join(args["output_dir"], "config_bk.yml")
    logging.info("Backing up configuration to: %s" % config_bk)
    config_bk_file = open(config_bk, "w")
    yaml.dump(args, config_bk_file, default_flow_style=False)
    config_bk_file.close()

    # initialize and run the experiment num_run times
    run_start_id = args["run_start_id"]
    num_runs = args["num_runs"]
    if args.get("num_random_draws") is not None:
        # Redefine num_runs, and use args["num_runs"] only when drawing
        # pair of rankers in the run() function above.
        num_runs = args["num_random_draws"]
        assert run_start_id == 0, "Conflicting options"
    experimenter = get_class(args["experimenter"])

    # set the random seed
    random.seed(42)

    if "processes" in args and args["processes"] > 1:
        from multiprocessing import Pool
        pool = Pool(processes=args["processes"])
        for run_id in range(run_start_id, run_start_id + num_runs):
            pool.apply_async(run, (run_id, experimenter, args,))
        pool.close()
        pool.join()
    else:
        for run_id in range(run_start_id, run_start_id + num_runs):
            run(run_id, experimenter, args)
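
Note that pool.apply_async above discards its AsyncResult, so an exception raised inside a worker vanishes silently. A self-contained variation of the same fan-out (a sketch; the run body is a dummy stand-in) that keeps the handles and re-raises worker errors in the parent:

from multiprocessing import Pool

def run(run_id, experimenter, args):
    return run_id  # stand-in for the real per-run experiment

if __name__ == "__main__":
    pool = Pool(processes=2)
    results = [pool.apply_async(run, (run_id, None, {}))
               for run_id in range(4)]
    pool.close()
    pool.join()
    for result in results:
        result.get()  # re-raises any exception from the worker here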
Example #3
    def __init__(self, args_str=None):
        # parse arguments
        parser = argparse.ArgumentParser(description="""
            Construct and run a learning experiment. Provide either the name
            of a config file from which the experiment configuration is
            read, or provide all arguments listed under Command line. If
            both are provided the  config file is ignored.""",
                                         prog=self.__class__.__name__)

        # option 1: use a config file
        file_group = parser.add_argument_group("FILE")
        file_group.add_argument("-f",
                                "--file",
                                help="Filename of the config "
                                "file from which the experiment details"
                                " should be read.")

        # option 2: specify all experiment details as arguments
        detail_group = parser.add_argument_group("DETAILS")
        detail_group.add_argument(
            "-i",
            "--training_queries",
            help="File from which to load the training queries (svmlight "
            "format).")
        detail_group.add_argument(
            "-j",
            "--test_queries",
            help="File from which to load the test queries (svmlight format).")
        detail_group.add_argument(
            "-c",
            "--feature_count",
            type=int,
            help="The number of features included in the data.")
        detail_group.add_argument(
            "-r",
            "--num_runs",
            type=int,
            help="Number of runs (how many times to repeat the experiment).")
        detail_group.add_argument("-q",
                                  "--num_queries",
                                  type=int,
                                  help="Number of queries in each run.")
        detail_group.add_argument("-u",
                                  "--user_model",
                                  help="Class implementing a user model.")
        detail_group.add_argument(
            "-v",
            "--user_model_args",
            help="Arguments for initializing the user model.")
        # the retrieval system maintains ranking functions, accepts queries and
        # generates result lists, and in return receives user clicks to learn
        # from
        detail_group.add_argument(
            "-s",
            "--system",
            help="Which system to use (e.g., pairwise, listwise).")
        detail_group.add_argument("-a",
                                  "--system_args",
                                  help="Arguments for "
                                  "the system (comparison method, learning "
                                  "algorithm and parameters...).")
        detail_group.add_argument(
            "-o",
            "--output_dir",
            help="(Empty) directory for storing output generated by this"
            " experiment. Subdirectories for different folds will be"
            " generated automatically.")
        detail_group.add_argument("--output_dir_overwrite",
                                  action="store_true",
                                  default=False)
        detail_group.add_argument(
            "-p",
            "--output_prefix",
            help="Prefix to be added to output filenames, e.g., the name of "
            "the data set, fold, etc. Output files will be stored as "
            "OUTPUT_DIR/PREFIX-RUN_ID.txt")
        detail_group.add_argument("-e",
                                  "--experimenter",
                                  help="Experimenter type.")
        detail_group.add_argument("-sd", "--seed", type=int)
        # run the parser
        if args_str:
            args = parser.parse_known_args(args_str.split())[0]
        else:
            args = parser.parse_known_args()[0]

        # determine whether to use config file or detailed args
        self.experiment_args = None
        self.args_file = args.file
        if args.file:
            config_file = open(args.file)
            self.experiment_args = yaml.load(config_file, Loader=yaml.Loader)
            config_file.close()
            # overwrite with command-line options if given
            for arg, value in vars(args).items():
                if value:
                    self.experiment_args[arg] = value
        else:
            self.experiment_args = vars(args)

        # workaround - check whether all required arguments are present
        required = ("training_queries", "test_queries", "feature_count",
                    "num_runs", "num_queries", "user_model", "user_model_args",
                    "system", "system_args", "output_dir")
        if not all(key in self.experiment_args for key in required):
            parser.print_help()
            sys.exit("Missing required arguments, please check the program"
                     " arguments or configuration file. %s" %
                     self.experiment_args)

        # set default values for optional arguments
        if "query_sampling_method" not in self.experiment_args:
            self.experiment_args["query_sampling_method"] = "random"
        if "output_dir_overwrite" not in self.experiment_args:
            self.experiment_args["output_dir_overwrite"] = False
        if "experimenter" not in self.experiment_args:
            self.experiment_args["experimenter"] = \
                "experiment.LearningExperiment.LearningExperiment"
        if "evaluation" not in self.experiment_args:
            self.experiment_args["evaluation"] = "evaluation.NdcgEval"
        if "processes" not in self.experiment_args:
            self.experiment_args["processes"] = 0
        if "seed" not in self.experiment_args:
            np.random.seed(42)
        else:
            np.random.seed(self.experiment_args['seed'])

        # locate or create directory for the current fold
        if not os.path.exists(self.experiment_args["output_dir"]):
            os.makedirs(self.experiment_args["output_dir"])
        elif not (self.experiment_args["output_dir_overwrite"]) and \
                os.listdir(self.experiment_args["output_dir"]):
            # make sure the output directory is empty
            raise Exception(
                "Output dir %s is not an empty directory. Please"
                " use a different directory, or move contents out of the way."
                % self.experiment_args["output_dir"])

        logging.basicConfig(
            format='%(levelname)s %(module)s %(asctime)s: %(message)s',
            level=logging.INFO)
        logging.info("Arguments: %s" % self.experiment_args)

        # Printing out arguments that are used in execution
        for k, v in sorted(self.experiment_args.items()):
            logging.info("\t%s: %s" % (k, v))
        config_bk = os.path.join(self.experiment_args["output_dir"],
                                 "config_bk.yml")
        logging.info("Backing up configuration to: %s" % config_bk)
        with open(config_bk, "w") as config_bk_file:
            yaml.dump(self.experiment_args,
                      config_bk_file,
                      default_flow_style=False)

        # load training and test queries
        training_file = self.experiment_args["training_queries"]
        test_file = self.experiment_args["test_queries"]
        self.feature_count = self.experiment_args["feature_count"]
        logging.info("Loading training data: %s " % training_file)
        self.training_queries = load_queries(training_file, self.feature_count)
        logging.info("... found %d queries." %
                     self.training_queries.get_size())
        logging.info("Loading test data: %s " % test_file)
        self.test_queries = load_queries(test_file, self.feature_count)
        logging.info("... found %d queries." % self.test_queries.get_size())

        # initialize and run the experiment num_run times
        self.num_runs = self.experiment_args["num_runs"]
        self.output_dir = self.experiment_args["output_dir"]
        self.output_prefix = self.experiment_args["output_prefix"]
        self.experimenter = get_class(self.experiment_args["experimenter"])
Example #4
    def __init__(self):
        # parse arguments
        parser = argparse.ArgumentParser(description="""Meta experiment""")

        file_group = parser.add_argument_group("FILE")
        file_group.add_argument("-f", "--file", help="Filename of the config "
                                                     "file from which the experiment details"
                                                     " should be read.")
        # option 2: specify all experiment details as arguments
        detail_group = parser.add_argument_group("DETAILS")
        detail_group.add_argument("-p", "--platform", help="Specify "
                                                           "'local' or 'celery'")
        detail_group.add_argument('--data', help="Data in the following"
                                                 "format: trainfile,testfile,d,r such that "
                                                 "a data file can be found in "
                                                 "datadir/trainfile/Fold1/train.txt",
                                  type=str, nargs="+")
        detail_group.add_argument('--um', nargs="+")
        detail_group.add_argument('--uma', help="",
                                  type=str, nargs="+")
        detail_group.add_argument('--analysis', nargs="*")
        detail_group.add_argument('--data_dir')
        detail_group.add_argument('--output_base')
        detail_group.add_argument('--experiment_name')
        detail_group.add_argument("-r", "--rerun", action="store_true",
                                  help="Rerun last experiment.",
                                  default=False)
        detail_group.add_argument("--queue_name", type=str)

        args = parser.parse_known_args()[0]

        logging.basicConfig(format='%(asctime)s %(module)s: %(message)s',
                            level=logging.INFO)

        # determine whether to use config file or detailed args
        self.experiment_args = {}  # stays empty unless filled from a config file
        if args.file:
            config_file = open(args.file)
            config = yaml.load(config_file, Loader=yaml.Loader)
            self.experiment_args = config
            config_file.close()
            try:
                self.meta_args = vars(parser.parse_known_args(
                    self.experiment_args["meta"].split())[0])
            except KeyError:
                parser.error("Please make sure there is a 'meta' section "
                             "present in the config file")
            # overwrite with command-line options if given
            for arg, value in vars(args).items():
                if value:
                    self.meta_args[arg] = value
        else:
            self.meta_args = vars(args)

        for k in list(self.meta_args.keys()) + ["meta"]:
            if k in self.experiment_args:
                del self.experiment_args[k]

        if self.meta_args["platform"] == "local":
            self.run = self.run_local
        elif self.meta_args["platform"] == "conf":
            self.run = self.run_conf
        else:
            parser.error("Please specify a valid platform.")

        usermodels = {}
        for umstr in self.meta_args["uma"]:
            parts = umstr.split(',')
            um, car = parts[:2]
            car = int(car)
            if len(parts) != car * 2 + 2:
                parser.error("Malformed --uma value: %r" % umstr)
            p_click = ", ".join(parts[2:2 + car])
            p_stop = ", ".join(parts[2 + car:])
            if um not in usermodels:
                usermodels[um] = {}
            usermodels[um][car] = "--p_click %s --p_stop %s" % \
                                  (p_click, p_stop)

        basedir = os.path.join(os.path.abspath(self.meta_args["output_base"]),
                               self.meta_args["experiment_name"])

        i = 0
        while os.path.exists(os.path.join(basedir, "v%03d" % i)):
            i += 1
        if i > 0 and self.meta_args["rerun"]:
            i -= 1
        logging.info("Running experiment v%03d" % i)
        basedir = os.path.join(basedir, "v%03d" % i)
        if not os.path.exists(basedir):
            os.makedirs(basedir)
        logging.info("Results appear in %s" % basedir)

        config_bk = os.path.join(basedir, "meta_config_bk.yml")
        with open(config_bk, "w") as config_bk_file:
            yaml.dump(self.meta_args,
                      config_bk_file,
                      default_flow_style=False,
                      Dumper=yaml.Dumper)

        skip = 0
        self.configurations = []
        for run_id in range(self.experiment_args["num_runs"]):
            for um in self.meta_args["um"]:
                for dstr in self.meta_args["data"]:
                    dparts = dstr.split(',')
                    data, d, r = dparts[:3]
                    d, r = int(d), int(r)
                    user_model_args = usermodels[um][r]
                    folds = glob.glob(os.path.join(
                        os.path.abspath(self.meta_args["data_dir"]),
                        data,
                        "Fold*"))
                    for fold in folds:
                        args = self.experiment_args.copy()
                        if len(dparts) > 3:
                            selected_weights = ",".join(dparts[3:])
                            args["system_args"] += " --selected_weights " + \
                                                   selected_weights
                        args["data_dir"] = self.meta_args["data_dir"]
                        args["fold_dir"] = fold
                        #            args["run_id"] = run_id
                        args["feature_count"] = d
                        args["user_model_args"] = user_model_args
                        args["output_dir"] = os.path.join(basedir,
                                                          'output',
                                                          um,
                                                          data,
                                                          os.path.basename(fold))
                        args["output_prefix"] = os.path.basename(fold)
                        args["run_id"] = run_id
                        if self.meta_args["rerun"]:
                            if not os.path.exists(os.path.join(
                                    args["output_dir"],
                                    "%s-%d.txt.gz" %
                                    (args["output_prefix"],
                                     run_id))):
                                self.configurations.append(args)
                            else:
                                skip += 1
                        else:
                            self.configurations.append(args)
        logging.info("Created %d configurations (and %d skipped)" % (
            len(self.configurations),
            skip))
        self.analytics = []
        if self.meta_args["analysis"]:
            for analyse in self.meta_args["analysis"]:
                aclass = get_class(analyse)
                a = aclass(basedir)
                self.analytics.append(a)
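
The --uma strings parsed above pack a user-model name, a cardinality, and two probability lists into a single comma-separated value. A worked example of the format (model name and probabilities are illustrative):

umstr = "environment.CascadeUserModel,2,0.2,0.4,0.1,0.3"
parts = umstr.split(',')
um, car = parts[:2]
car = int(car)
assert len(parts) == car * 2 + 2  # name + cardinality + car clicks + car stops
p_click = ", ".join(parts[2:2 + car])  # "0.2, 0.4"
p_stop = ", ".join(parts[2 + car:])    # "0.1, 0.3"
print("--p_click %s --p_stop %s" % (p_click, p_stop))
# prints: --p_click 0.2, 0.4 --p_stop 0.1, 0.3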