Пример #1
0
    def __init__(self, config):
        """Load the study's status XML, (re)creating it if missing or stale,
        and index per-experiment data by experiment name.

        Also records the k-fold count (0 when not k-fold) and a
        filename-friendly completion timestamp.
        """
        self._config = config
        self._experiments = {}

        status_path = config.get_files().get_status_path()
        study_xml = None
        if status_path.exists():
            study_xml = xml_load_from_path(status_path)

        # A missing file, or an output with at most one child element (just
        # the 'check' element), means the study has not run: regenerate the
        # output and reload. This single test also covers the old separate
        # `study_xml is None` branch, which duplicated the same work.
        if study_xml is None or len(list(study_xml)) <= 1:
            create_study_output(config)
            study_xml = xml_load_from_path(status_path)

        for exp in study_xml.xpath('//experiment'):
            # Experiment names are derived from the element's 'count' attribute.
            exp_name = Files.get_exp_name(exp.attrib['count'])
            self._experiments[exp_name] = ExperimentData(exp)

        model = single_xpath(study_xml, '/study/config/splitter/model')
        if model.text == 'kcv':
            self._kcv_count = int(model.attrib['count'])
        else:
            self._kcv_count = 0

        # Completion time, reformatted for use in file names.
        time = single_xpath(study_xml, 'completed_at').text
        time_obj = datetime.strptime(time, '%Y-%m-%d %H:%M:%S.%f')
        self._timestamp = time_obj.strftime("%Y%m%d_%H%M%S")
Пример #2
0
    def process_params(self, param_elements):
        """Record the name and value text of each <param> element."""
        names = []
        values = []

        # One pass over the elements, collecting both columns together.
        for elem in param_elements:
            names.append(single_xpath(elem, 'name').text)
            values.append(single_xpath(elem, 'value').text)

        self._params = names
        self._vals = values
Пример #3
0
    def fix_list_length(self):
        """Copy the reranker's max_len parameter into the metric list-size.

        No-op when the configuration has no reranker max_len parameter.
        """
        rerank_size_elem = single_xpath(
            self._config._xml_input,
            '/librec-auto/rerank/script/param[@name="max_len"]')
        if rerank_size_elem is None:
            return

        list_size_elem = single_xpath(self._config._xml_input,
                                      "/librec-auto/metric/list-size")
        list_size_elem.text = rerank_size_elem.text
        self._config.write_exp_configs()
Пример #4
0
    def dry_run(self, config):
        """Print, without executing, the re-rank command line for every
        sub-experiment in the study."""
        self._files = config.get_files()
        self._config = config

        files = config.get_files()
        # range() is simply empty for a zero experiment count, so the old
        # `if count > 0` guard was redundant. The unused `ref_path` local
        # was also dropped.
        for i in range(files.get_exp_count()):
            sub_paths = files.get_exp_paths(i)
            script_elem = single_xpath(sub_paths.get_study_conf(),
                                       '/librec-auto/rerank/script')
            param_spec = create_param_spec(script_elem)
            script_path = get_script_path(script_elem, 'rerank')
            result_path = sub_paths.get_path('result')

            original_path = self.find_original_results(result_path, script_path, sub_paths)

            print(
                f'librec-auto (DR): Running re-ranking command {self} for {sub_paths.exp_name}'
            )

            proc_spec = [
                            sys.executable,
                            script_path.as_posix(),
                            self._config.get_files().get_config_file_path().name,
                            original_path.absolute().as_posix(),
                            result_path.absolute().as_posix()
                        ] + param_spec
            print_process_cli(proc_spec, str(self._config.get_files().get_study_path().absolute()))
Пример #5
0
 def cross_validation(self):
     """Return the fold count for k-fold cross-validation, or 1 otherwise."""
     model_elem = single_xpath(self._xml_input,
                               '/librec-auto/splitter/model')
     is_kcv = model_elem.text == 'kcv'
     return int(model_elem.get('count')) if is_kcv else 1
Пример #6
0
 def set_data_path(self, config_xml):
     """Set the data directory path from the configuration.

     Defaults to 'data' when the data-dir element is absent, matching the
     emitted warning.
     """
     data_dir_elem = single_xpath(config_xml, '/librec-auto/data/data-dir')
     if data_dir_elem is None:
         logging.warning(
             "Configuration file missing data-dir element. Assuming 'data'."
         )
         # Bug fix: the warning promised 'data' but nothing was assigned.
         self._data_dir_path = 'data'
     else:
         self._data_dir_path = data_dir_elem.text
Пример #7
0
def load_item_features(config, data_path):
    """Load the item-feature CSV named in the configuration.

    Returns a DataFrame indexed by item id with 'feature' and 'value'
    columns, or None (after printing a message) when the file is missing.
    """
    item_feature_file = single_xpath(
        config.get_xml(), '/librec-auto/features/item-feature-file').text
    item_feature_path = data_path / item_feature_file

    if not item_feature_path.exists():
        # Bug fix: `str + Path` raised TypeError here; format instead.
        print(f"Cannot locate item features. Path: {item_feature_path}")
        return None

    item_feature_df = pd.read_csv(item_feature_path,
                                  names=['itemid', 'feature', 'value'])
    item_feature_df.set_index('itemid', inplace=True)
    return item_feature_df
def main():
    """Entry point: re-rank each LibRec result file with the configured
    fairness helper and write outputs to the destination directory."""
    args = read_args()
    config = read_config_file(args['conf'], '.')

    original_results_path = Path(args['original'])
    result_files = enumerate_results(original_results_path)

    dest_results_path = Path(args['result'])

    data_dir = single_xpath(config.get_xml(), '/librec-auto/data/data-dir').text
    data_path = Path(data_dir).resolve()

    item_feature_df = load_item_features(config, data_path)
    # Bail out before doing any further config work: nothing below is
    # meaningful without item features. (The old code read the protected
    # feature first and only then checked for None.)
    if item_feature_df is None:
        exit(-1)

    protected = single_xpath(config.get_xml(), '/librec-auto/metric/protected-feature').text

    alpha = float(args['alpha'])
    max_len = int(args['max_len'])
    binary = args['binary'] == 'True'

    helper = set_helper(alpha, max_len, binary, protected, item_feature_df)

    for file_path in result_files:
        results_df = pd.read_csv(file_path, names=['userid', 'itemid', 'rating'])

        fair = generate_fairstar(helper)
        reranked_df = rerank(results_df, fair, helper)

        output_reranked(reranked_df, dest_results_path, file_path)
Пример #9
0
    def dry_run(self, config):
        """Print, without executing, each post-processing script and its
        parameters, masking any password value."""
        self._config = config
        print(f'librec-auto (DR): Running post command {self}')

        post_elems = config.get_xml().xpath(self.POST_ELEM_XPATH)

        for post_elem in post_elems:
            param_spec = utils.create_param_spec(post_elem)
            # Bug fix: '//param' is document-absolute in lxml, so one script
            # with a password made *every* script report one. Use a relative
            # path, matching handle_password().
            if single_xpath(post_elem,
                            "param[@name='password']") is not None:
                param_spec = param_spec + ['--password=<password hidden>']
            script_path = utils.get_script_path(post_elem, 'post')

            print(f'\tPost script: {script_path}')
            print(f'\tParameters: {param_spec}')
Пример #10
0
def main():
    """Entry point: re-rank every result file in parallel, one worker
    process per file, writing output to the destination directory."""
    args = read_args()
    config = read_config_file(args['conf'], '.')

    original_results_path = Path(args['original'])
    result_files = enumerate_results(original_results_path)

    dest_results_path = Path(args['result'])

    data_dir = single_xpath(config.get_xml(),
                            '/librec-auto/data/data-dir').text

    data_path = Path(data_dir).resolve()

    item_feature_df = load_item_features(config, data_path)
    if item_feature_df is None:
        exit(-1)

    rerank_helper = Rerank_Helper()
    rerank_helper.set_rerank_helper(args, config, item_feature_df)

    split_path = data_path / 'split'
    pat = re.compile(RESULT_FILE_PATTERN)

    # One process per result file; start them all, then join them all.
    # (The unused `method = args['method']` local was removed.)
    procs = []
    for file_path in result_files:
        proc = multiprocessing.Process(target=execute,
                                       args=(rerank_helper, pat, file_path,
                                             split_path, dest_results_path))
        procs.append(proc)
        proc.start()

    for proc in procs:
        proc.join()
Пример #11
0
    def dry_run(self, config):
        """Print the re-rank script, its parameters, and the reference
        results path for each sub-experiment, without running anything."""
        self._files = config.get_files()
        self._config = config

        files = config.get_files()
        if files.get_exp_count() > 0:
            for idx in range(files.get_exp_count()):
                exp_paths = files.get_exp_paths(idx)
                conf_xml = exp_paths.get_study_conf()
                script_elem = single_xpath(conf_xml,
                                           '/librec-auto/rerank/script')
                params = create_param_spec(script_elem)
                script = get_script_path(script_elem, 'rerank')
                ref = exp_paths.get_ref_exp_name()

                print(
                    f'librec-auto (DR): Running re-ranking command {self} for {exp_paths.exp_name}'
                )
                print(f'\tRe-rank script: {script}')
                print(f'\tParameters: {params}')
                if ref:
                    print(f'\tResults from: {ref}')
Пример #12
0
    def __init__(self, sub_paths):
        """Load an experiment's status XML (when present) and extract its
        message, log file, and parameter/value lists."""
        self._subpaths = sub_paths
        status_path = self._subpaths.get_path('status')

        if status_path.exists():
            self._name = sub_paths.exp_name
            self._status_xml = xml_load_from_path(status_path)
            self._message = single_xpath(self._status_xml,
                                         '/librec-auto-status/message').text

            if self._subpaths.get_path('log').exists():
                self._log = LogFile(self._subpaths)
            else:
                self._log = None

            # xpath() returns a (possibly empty) list, never None, so the
            # old `params != None` test was always true. Use truthiness.
            params = self._status_xml.xpath('//param')
            if params:
                self.process_params(params)
            else:
                self._params = []
                # Bug fix: was `self.m_vals`, which left `_vals` unset.
                self._vals = []
Пример #13
0
    def dry_run(self, config):
        """Print, without executing, the full command line for each
        post-processing script, masking any password value."""
        self._config = config
        print(f'librec-auto (DR): Running post command {self}')

        post_elems = config.get_xml().xpath(self.POST_ELEM_XPATH)

        for post_elem in post_elems:
            param_spec = utils.create_param_spec(post_elem)
            if single_xpath(post_elem,
                            "//param[@name='password']") is not None:
                param_spec = param_spec + ['--password=<password hidden>']
            script_path = utils.get_script_path(post_elem, 'post')

            proc_spec = [
                sys.executable,
                script_path.absolute().as_posix(),
                self._config.get_files().get_config_file_path().name
            ] + param_spec

            # Bug fix: `.absolute` (the bound method object) was passed
            # instead of the path; call it and stringify, as RerankCmd does.
            print_process_cli(
                proc_spec,
                str(self._config.get_files().get_study_path().absolute()))
Пример #14
0
 def handle_password(self, post_elem, config, param_spec):
     """Append a --password argument when the script declares a password
     parameter and the config holds a key password; return param_spec."""
     password_param = single_xpath(post_elem, "param[@name='password']")
     if password_param is not None:
         password = config.get_key_password()
         if password:
             param_spec.append(f'--password={password}')
     return param_spec
Пример #15
0
def setup_commands(args: dict, config: ConfigCmd):
    """Map the CLI `action` to the command object(s) that implement it.

    Handles: purge, status, post, rerank, split, bbo, run, show, eval,
    check. Returns a single command, a bracketed SequenceCmd, or (for
    'bbo') a plain list of commands. Raises InvalidCommand,
    InvalidConfiguration, or UnsupportedFeatureException for unsupported
    combinations.
    """
    action = args['action']
    purge_no_ask = args['quiet']
    # NOTE(review): alg_lang is computed but never used below — TODO confirm
    # whether an alg-language branch was intended.
    alg_lang = execution_platform(config, 'alg')
    met_lang = execution_platform(config, 'metric')
    # Create flags for optional steps
    rerank_flag = config.has_rerank()
    post_flag = config.has_post()

    # Flag to use/avoid check
    # if true, user specified don't run check, else, run check.
    no_check_flag = args['no_check']

    # Set the password in the configuration if we have it
    if args['key_password']:
        config.set_key_password(args['key_password'])

    # Purge files (possibly) from splits and subexperiments
    if action == 'purge':
        return PurgeCmd(purge_type(args), no_ask=purge_no_ask)

    # Shows the status of the experiment
    if action == 'status':
        return StatusCmd()

    # Perform (only) post-processing on results
    if action == 'post' and post_flag:
        return PostCmd()
    # No post scripts available
    if action == 'post' and not post_flag:
        raise InvalidCommand(
            action,
            "No post-processing scripts available for \"post\" command")

    # Perform re-ranking on results, followed by evaluation and post-processing
    if action == 'rerank' and rerank_flag:  # Runs a reranking script on the python side
        cmd1 = RerankCmd()
        cmd2 = build_librec_commands('eval', args, config)
        cmd3 = EvalCmd(args, config)  # python-side eval
        cmd = SequenceCmd([cmd1, cmd2, cmd3])

        bracketed_cmd = bracket_sequence('rerank', args, config, cmd)
        return bracketed_cmd
    # No re-ranker available
    if action == 'rerank' and not rerank_flag:
        raise InvalidCommand(
            action, "No re-ranker scripts available for \"rerank\" command.")

    # LibRec actions
    # re-run splits only
    if action == 'split':
        cmd = SequenceCmd([build_librec_commands('split', args, config)])
        bracketed_cmd = bracket_sequence('split', args, config, cmd)
        return bracketed_cmd

    # re-run experiment
    if action == 'bbo':
        cmd1 = PurgeCmd('results', no_ask=purge_no_ask)
        cmd2 = SetupCmd(False)
        # NOTE(review): cmd3 is never used; init_cmds below repeats
        # [cmd1, cmd2] — TODO confirm and remove one of them.
        cmd3 = [cmd1, cmd2]
        if config.has_alg_script():
            cmd_store = build_alg_commands(args, config, BBO=200)
        else:
            cmd_store = build_librec_commands('full', args, config, BBO=200)
        # NOTE(review): store_post is never used, and exec_cmds below is
        # rebuilt with build_librec_commands even when an alg script exists —
        # presumably exec_cmds should reuse cmd_store; verify against callers.
        store_post = [PostCmd() for _ in range(len(cmd_store))]

        init_cmds = [cmd1, cmd2]
        check_cmds = []
        if not no_check_flag:
            # check_cmds = [build_librec_commands('check',args,config), CheckCmd()]
            librec_check = build_librec_commands('check',
                                                 args,
                                                 config,
                                                 BBO=200)
            check_cmds = [librec_check[0], CheckCmd()]

        exec_cmds = build_librec_commands('full', args, config, BBO=200)
        exec_cmds = [
            SequenceCmd([exec_cmds[i]]) for i in range(len(exec_cmds))
        ]

        if rerank_flag:
            # cmd.append(RerankCmd())
            # cmd.append(build_exp_commands('eval', args, config))
            raise UnsupportedFeatureException(
                "Optimization",
                "Optimization is not currently supported with reranking")

        final_cmds = []

        if post_flag:
            final_cmds.append(PostCmd())
        else:
            final_cmds.append(CleanupCmd())

        # cmd = init_cmds + check_cmds + exec_cmds + final_cmds

        # NOTE(review): check_cmds is computed above but excluded from the
        # final command list (see the commented line) — TODO confirm intent.
        cmd = init_cmds + exec_cmds + final_cmds

        return cmd

    # re-run experiment and continue
    if (action == 'run' or action == 'show') and not config.has_alg_script():
        cmd1 = build_librec_commands('full', args, config)
        add_eval = maybe_add_eval(config=config)
        if add_eval:
            # cmd2 = EvalCmd(args, config)  # python-side eval
            cmd2 = build_eval_commands(args, config, met_lang)
            cmd = SequenceCmd([cmd1, cmd2])
        else:
            cmd = SequenceCmd([cmd1])
        if rerank_flag:
            cmd.add_command(RerankCmd())
            cmd.add_command(build_librec_commands('eval', args, config))
        # bracketed_cmd = bracket_sequence('results', args, config, cmd)
        bracketed_cmd = bracket_sequence('all', args, config, cmd)
        return bracketed_cmd

    if (action == 'run' or action == 'show') and config.has_alg_script():
        # if met_lang == 'system':
        cmd1 = build_alg_commands(args, config)
        add_eval = maybe_add_eval(config=config)
        if add_eval:
            cmd2 = EvalCmd(args, config)  # python-side eval
            cmd = SequenceCmd([cmd1, cmd2])
        else:
            cmd = SequenceCmd([cmd1])
        if rerank_flag:
            cmd.add_command(RerankCmd())
            cmd.add_command(build_librec_commands('eval', args, config))
        # bracketed_cmd = bracket_sequence('results', args, config, cmd)
        bracketed_cmd = bracket_sequence('all', args, config, cmd)
        return bracketed_cmd

    # eval-only
    if action == 'eval':
        if single_xpath(config.get_xml(), '/librec-auto/optimize') is not None:
            raise InvalidConfiguration(
                "Eval-only not currently supported with Bayesian optimization."
            )

        # cmd1 = PurgeCmd('post', no_ask=purge_no_ask)
        # cmd2 = SetupCmd()
        cmd1 = build_librec_commands('eval', args, config)
        cmd2 = EvalCmd(args, config)  # python-side eval
        cmd = SequenceCmd([cmd1, cmd2])
        bracketed_cmd = bracket_sequence('post', args, config, cmd)
        return bracketed_cmd

    # check setup of experiment
    # We don't check on algorithm scripts
    if action == 'check':
        cmd1 = build_librec_commands('check', args, config)
        cmd2 = CheckCmd()
        cmd = SequenceCmd([cmd1, cmd2])
        bracketed_cmd = bracket_sequence('none', args, config, cmd)
        return bracketed_cmd
Пример #16
0
 def setup_bbo(self):
     """Record the optimization iteration count (0 when no <optimize>)."""
     opt_elem = single_xpath(self._xml_input, '/librec-auto/optimize')
     if opt_elem is not None:
         self._bbo_steps = int(single_xpath(opt_elem, 'iterations').text)
     else:
         self._bbo_steps = 0
Пример #17
0
 def has_alg_script(self):
     '''
     Return True when the <alg> element of the configuration has a script.
     '''
     return single_xpath(self._xml_input, '/librec-auto/alg/script') is not None
Пример #18
0
 def has_metric_script(self):
     '''
     Return True when the <metric> element of the configuration has a script.
     '''
     return single_xpath(self._xml_input, '/librec-auto/metric/script') is not None
Пример #19
0
    def execute(self, config: ConfigCmd):
        """Validate the study configuration and write check results to
        output.xml in the study directory.

        Checks performed: write access on study paths, required config
        sections, library existence, data file existence, script locations,
        and <optimize> element structure. Afterwards, experiment logs are
        scanned and non-ignorable errors are appended to the output.
        """
        self._status = Cmd.STATUS_INPROC
        files = config.get_files()
        pwd = files.get_study_path()
        config_xml = config._xml_input
        config_elements = config_xml.getchildren()

        output_path = config.get_files().get_study_path()
        output_xml_path = str(output_path / "output.xml")
        # Whether a previous run left an output.xml behind.
        study_ran = Path(output_xml_path).exists()
        check_output_xml(output_xml_path)
        if study_ran:
            os.remove(output_xml_path)

        # Check should be the first thing writing to an output.xml file.
        # (The freshly created element cannot have a 'check' child, so the
        # old find()/remove() step here was dead code and has been dropped.)
        output_tree = etree.Element("study")

        # Check all paths have write access. Status/post/split paths are
        # exempt because they may not exist yet at check time.
        for func in dir(files):
            if re.match(r'get_.*path$', func):
                getpath = getattr(files, func)
                if func == 'get_status_path' or func == 'get_post_path' or func == 'get_split_path':
                    continue
                if not os.access(getpath(), os.W_OK):
                    raise InvalidConfiguration(
                        getpath(), f"Write access not granted {func}")

        # Check all necessary elements are in the config.
        curr_elem = [e.tag for e in config_elements]
        necc_elem = {
            'data': 'Data section',
            'splitter': 'Splitter section',
            'alg': 'Algorithm section',
            'metric': 'Metric section'
        }
        for elem in necc_elem.keys():
            if elem not in curr_elem:
                raise InvalidConfiguration(
                    necc_elem[elem],
                    f"{necc_elem[elem]} missing in configuration file.")

        # Checking library: 'system' libraries live under the lib path,
        # otherwise the src attribute is a study-relative directory.
        library = single_xpath(config_xml, '/librec-auto/library')
        if library.attrib['src'] == "system":
            lib_path = files.get_lib_path() / library.text
        else:
            lib_path = pwd / library.attrib['src'] / library.text
        if not lib_path.exists():
            raise InvalidConfiguration(lib_path,
                                       "Library not found at given path.")

        # Checking data.
        data_dir = single_xpath(config_xml, '/librec-auto/data/data-dir')
        # Test to see how many data directories were given.
        num_data_dir_test = config_xml.xpath('/librec-auto/data/data-dir')
        if len(num_data_dir_test) > 1:
            raise InvalidConfiguration("Data Directory",
                                       "More than one data file found.")
        # Checking path to data directory.
        data_dir_path = Path(pwd / data_dir.text)
        data_file = single_xpath(config_xml, '/librec-auto/data/data-file')
        data_file_path = Path(data_dir_path / data_file.text)
        if not data_file_path.exists():
            raise InvalidConfiguration(str(data_file_path),
                                       "Data file not found at given path.")

        # Checking script paths/files exist and that scripts are in approved locations.
        for elem in config_elements:
            script_element = elem.findall('script')
            # findall returns a list; check for items.
            if script_element:
                # Iterate over scripts.
                for se in script_element:
                    if se.attrib['src'] == "system":
                        # System scripts live under the installed package,
                        # in a fixed subdirectory per section tag.
                        if elem.tag == 'metric':
                            script_path = files.get_global_path(
                            ) / 'librec_auto' / 'core' / 'cmd' / 'eval'
                        elif elem.tag == 'post':
                            script_path = files.get_global_path(
                            ) / 'librec_auto' / 'core' / 'cmd' / 'post'
                        elif elem.tag == 'rerank':
                            script_path = files.get_global_path(
                            ) / 'librec_auto' / 'core' / 'cmd' / 'rerank'
                        elif elem.tag == 'alg':
                            script_path = files.get_global_path(
                            ) / 'librec_auto' / 'core' / 'cmd' / 'alg'
                        else:
                            raise InvalidConfiguration(
                                elem.tag,
                                f"Scripts not allowed in {elem.tag} section.")
                    else:
                        script_path = Path(se.attrib['src'])
                    script_name = se.find('script-name')
                    script_path = script_path / script_name.text
                    if not script_path.exists():
                        raise InvalidConfiguration(
                            str(script_path),
                            f'{script_name.text} not found in given path.')
            # else: if there aren't script elements do nothing, for now

        if 'optimize' in curr_elem:
            alg = single_xpath(config_xml, '/librec-auto/alg')
            if alg is not None:
                for elem in alg:
                    # Parameters being optimized should have children: lower and upper.
                    if elem.getchildren():
                        children = [e.tag for e in elem.iterchildren()]
                        if 'value' in children:  # impossible case: librec-auto setup catches this first.
                            raise InvalidConfiguration(
                                'Optimization',
                                'Value tags not allowed in optimize element')
                        else:
                            # Bug fix: `'lower' and 'upper' not in children`
                            # evaluated as ('lower') and ('upper' not in ...),
                            # so a missing 'lower' alone was never caught.
                            if 'lower' not in children or 'upper' not in children:
                                raise InvalidConfiguration(
                                    'Optimization',
                                    f'Lower and upper tags missing in {elem.tag}'
                                )
                    else:
                        # For now continue; should add a check that the value
                        # from the reference xml and config xml have the same type.
                        pass

        # Create filepath attribute for errors as src.
        # If the compiler makes it to here without raising an error, then there are no errors.
        if not study_ran:  # if the output file doesn't exist
            check_tree = etree.SubElement(output_tree, "check")
            message_element = etree.SubElement(check_tree, "message")
            message_element.text = "No errors found in configuration file syntax."
        else:  # if it does
            check_tree = etree.Element("check")
            message_element = etree.SubElement(check_tree, "message")
            message_element.text = "No errors found in configuration file syntax."
            output_tree.insert(2, check_tree)

        # Reading the Java logs.
        # The check command shouldn't care about a missing librec.properties
        # file (unless run was ran).
        for i in range(0, config.get_sub_exp_count()):
            exp_path = config.get_files().get_exp_paths(i)
            log_object = LogFile(exp_path, study_ran)
            # src: filepath
            check_tree = output_tree.find('check')
            if check_tree is not None:
                if len(log_object._err_msgs.keys()) != 0:
                    # Dict-of-list comprehension to filter out ignorable errors.
                    temp_dict = {
                        k: [(line, m) for line, m in log_object._err_msgs[k]
                            if not self.is_ignorable_error(m)]
                        for k in log_object._err_msgs.keys()
                    }
                    # Filter out empty lists.
                    temp_dict = {k: v for k, v in temp_dict.items() if v}
                    # Iterate over the filtered dictionary.
                    if len(temp_dict.keys()) != 0:
                        for error in temp_dict.keys():
                            for line_number, message in temp_dict[error]:
                                message_element = etree.SubElement(
                                    check_tree, "message", {
                                        'src': str(log_object.get_log_path()),
                                        'logline': str(line_number),
                                        'exp_num': str(i)
                                    })
                                message_element.text = message.strip('\n')
                    else:
                        message_element = etree.SubElement(
                            check_tree, "message",
                            {'src': str(log_object.get_log_path())})
                        message_element.text = f"No errors found in experiment {i} log."
                else:
                    message_element = etree.SubElement(
                        check_tree, "message",
                        {'src': str(log_object.get_log_path())})
                    message_element.text = f"No errors found in experiment {i} log."

        output_tree.getroottree().write(output_xml_path, pretty_print=True)

        # Re-parse and re-serialize to normalize whitespace in the output.
        parser = etree.XMLParser(remove_blank_text=True)
        tree = etree.parse(output_xml_path, parser)
        tree.write(output_xml_path, encoding='utf-8', pretty_print=True)

        self._status = Cmd.STATUS_COMPLETE
Пример #20
0
def enumerate_results(result_path):
    """Return the file names in result_path that match RESULT_FILE_PATTERN."""
    pattern = re.compile(RESULT_FILE_PATTERN)
    entries = os.listdir(result_path)
    return [entry for entry in entries if pattern.match(entry)]


if __name__ == '__main__':
    args = read_args()
    #print(args)
    config = read_config_file(args['conf'], ".")
    result_files = enumerate_results(args['original'])

    split_path = config.get_files().get_split_path()
    # split_names = os.listdir(split_path)

    data_dir = single_xpath(config.get_xml(), '/librec-auto/data/data-dir').text
    item_feature_file = single_xpath(
        config.get_xml(), '/librec-auto/features/item-feature-file').text
    protected = single_xpath(config.get_xml(),
                             '/librec-auto/metric/protected-feature').text

    item_feature_path = Path(data_dir) / item_feature_file

    item_feature_df = None

    if not item_feature_path.exists():
        print("Cannot locate item features. Path: " + item_feature_path)
        exit(-1)
    else:
        item_feature_df = pd.read_csv(item_feature_path,
                                      names=['itemid', 'feature', 'value'])