Exemplo n.º 1
0
 def __check_config(config):
     """Checks config parameters and sets default values.

     ``KEY_COLLECTION`` must be present; ``KEY_LOWERCASE`` is set to True
     when it is absent. On an invalid configuration the error is logged
     and the process exits with status 1.

     :param config: configuration mapping, modified in place
     """
     try:
         if KEY_COLLECTION not in config:
             raise Exception(KEY_COLLECTION + " is missing")
         if KEY_LOWERCASE not in config:
             config[KEY_LOWERCASE] = True
     except Exception as e:
         # Logging formats lazily with "%": without a placeholder the
         # extra argument makes formatting fail and the reason is lost.
         PLOGGER.error("Error in config file: %s", e)
         sys.exit(1)
Exemplo n.º 2
0
    def check_config(config):
        """Checks config parameters and sets default values.

        Parameters that are missing (or None) are filled in place with their
        defaults. If the configuration is invalid, the error is logged and
        the process exits with status 1.

        :param config: configuration dict, modified in place
        """
        try:
            if config.get("index_name") is None:
                raise Exception("index_name is missing")

            # Checks first pass parameters
            if config.get("first_pass") is None:
                config["first_pass"] = {}
            first_pass = config["first_pass"]
            for key, default in (("1st_num_docs", 1000),
                                 ("field", Elastic.FIELD_CATCHALL),
                                 ("fields_return", ""),
                                 ("model", Elastic.BM25)):
                if first_pass.get(key) is None:
                    first_pass[key] = default

            for key, default in (("start", 0), ("num_docs", 100)):
                if config.get(key) is None:
                    config[key] = default

            model = config.get("model")
            if model in Retrieval.LM_MODELS:
                if config.get("smoothing_method") is None:
                    config["smoothing_method"] = ScorerLM.DIRICHLET
                if config.get("smoothing_param") is None:
                    # The default parameter depends on the smoothing method
                    if config["smoothing_method"] == ScorerLM.DIRICHLET:
                        config["smoothing_param"] = 2000
                    elif config["smoothing_method"] == ScorerLM.JM:
                        config["smoothing_param"] = 0.1
                    else:
                        raise Exception("Smoothing method is not supported.")

            # Each model expects its "fields" parameter in a different shape
            if model == "lm":
                if config.get("fields") is None:
                    config["fields"] = Elastic.FIELD_CATCHALL
                elif not isinstance(config["fields"], str):
                    raise Exception("Only a single field is required for LM.")
            elif model == "mlm":
                if config.get("fields") is None:
                    config["fields"] = {Elastic.FIELD_CATCHALL: 1}
                elif not isinstance(config["fields"], dict):
                    raise Exception(
                        "A dictionary of fields and their weights is required for MLM."
                    )
            elif model == "prms":
                if config.get("fields") is None:
                    config["fields"] = [Elastic.FIELD_CATCHALL]
                elif not isinstance(config["fields"], list):
                    raise Exception("A list of fields is required for PRMS.")
        except Exception as e:
            # Logging formats lazily with "%": without a placeholder the
            # extra argument makes formatting fail and the reason is lost.
            PLOGGER.error("Error in config file: %s", e)
            sys.exit(1)
Exemplo n.º 3
0
    def rank_ens(self):
        """Ranks candidate instances according to the learned LTR model.

        The candidates come from ``get_candidate_inss()``; the model is
        applied to them through ``ML.apply_model`` and they are returned.

        :return: the candidate instances after the model has been applied
        """
        model_missing = self.__model is None
        if model_missing:
            # Only logs: the model is still handed to apply_model below.
            PLOGGER.error("LTR model is not defined.")

        candidates = self.get_candidate_inss()
        learner = ML({})
        learner.apply_model(candidates, self.__model)
        return candidates
Exemplo n.º 4
0
 def __check_config(config):
     """Checks that the required params are present and valid.

     ``KEY_COLLECTION`` and ``KEY_MAPPING_FILE`` must be present, and the
     mapping file path must exist. Otherwise the error is logged and the
     process exits with status 1.

     :param config: configuration mapping
     :return: the config that was passed in
     """
     try:
         for key in (KEY_COLLECTION, KEY_MAPPING_FILE):
             if key not in config:
                 raise Exception(key + " is missing")
         if not op.exists(config[KEY_MAPPING_FILE]):
             raise Exception("Mapping file path does not exist.")
     except Exception as e:
         # Logging formats lazily with "%": without a placeholder the
         # extra argument makes formatting fail and the reason is lost.
         PLOGGER.error("Error in config file: %s", e)
         # The builtin-looking exit() is injected by the site module for
         # interactive use; raising SystemExit needs no import.
         raise SystemExit(1)
     return config
Exemplo n.º 5
0
    def add(self, doc_id, contents):
        """Upserts a document, setting the given contents on it.

        Every key of ``contents`` is escaped before being written with
        ``$set``. Errors raised by the update are logged, not propagated.

        :param doc_id: document id
        :param contents: mapping of field name to value
        """
        # escaping keys for content
        escaped = {self.__escape(key): value
                   for key, value in contents.items()}

        try:
            selector = {Mongo.ID_FIELD: self.__escape(doc_id)}
            self.__collection.update(selector, {'$set': escaped}, upsert=True)
        except Exception as err:
            message = "\nError (doc_id: " + str(doc_id) + ")\n" + str(err)
            PLOGGER.error(message)
Exemplo n.º 6
0
def main(args):
    """Checks that the needed DBpedia files exist, then builds the index.

    The process exits with status 1 as soon as a file is found missing.

    :param args: parsed arguments; ``args.config`` is handed to
        ``FileUtils.load_config``
    """
    config = FileUtils.load_config(args.config)
    dbpedia_path = config.get("dbpedia_files_path", "")
    # Check DBpedia files
    PLOGGER.info("Checking needed DBpedia files under {}".format(dbpedia_path))
    for fname in [ENTITY_ABSTRACTS_FILE] + ENTITY_TYPES_FILES:
        # os.path.join copes with an empty or separator-terminated base
        # path; joining on os.sep turned an empty base into "/<fname>".
        if not os.path.isfile(os.path.join(dbpedia_path, fname)):
            PLOGGER.error("  - {}: Missing".format(fname))
            # exit() is injected by the site module for interactive use;
            # raising SystemExit needs no import.
            raise SystemExit(1)
        PLOGGER.info("  - {}: OK".format(fname))

    indexer = IndexerDBpediaTypes(config)
    indexer.build_index(force=True)
Exemplo n.º 7
0
    def __init__(self, elastic, query, params):
        """Initializes the scorer from the given parameters.

        Reads "fields", "smoothing_method" and "smoothing_param" from
        ``params``. The smoothing parameter defaults to 2000 for Dirichlet
        and to 0.1 for JM smoothing. An unsupported smoothing method is
        logged and terminates the process with a non-zero status.

        :param elastic: passed on to the parent initializer
        :param query: passed on to the parent initializer
        :param params: dict of scorer parameters
        """
        super(ScorerLM, self).__init__(elastic, query, params)
        self._field = params.get("fields", Elastic.FIELD_CATCHALL)
        self._smoothing_method = params.get("smoothing_method", self.DIRICHLET).lower()
        if self._smoothing_method == self.DIRICHLET:
            self._smoothing_param = params.get("smoothing_param", 2000)
        elif self._smoothing_method == ScorerLM.JM:
            self._smoothing_param = params.get("smoothing_param", 0.1)
        else:
            PLOGGER.error(self._smoothing_method + " smoothing method is not supported!")
            # This is a failure, so it must not report exit status 0.
            sys.exit(1)

        self._tf = {}
Exemplo n.º 8
0
def main(args):
    """Builds the DBpedia URI index described by the config file.

    Field counts are loaded from the JSON file named by "fields_file" when
    that key is configured, and computed otherwise. The process exits with
    status 1 when the index name does not contain "_uri".

    :param args: parsed arguments; ``args.config`` is handed to
        ``FileUtils.load_config``
    """
    config = FileUtils.load_config(args.config)
    if "_uri" not in config["index_name"]:
        PLOGGER.error("index name might not be correct, please check again!")
        # Failure must not report status 0; raising SystemExit also avoids
        # the interactive-only exit() helper injected by the site module.
        raise SystemExit(1)

    if "fields_file" not in config:
        fields_count = compute_field_counts()
    else:
        # json.load needs an open file object, not the path itself
        with open(config["fields_file"]) as fields_file:
            fields_count = json.load(fields_file)

    indexer = IndexerDBpediaURI(config, fields_count)

    indexer.build()
    PLOGGER.info("Index build: " + config["index_name"])
Exemplo n.º 9
0
 def __check_config(config):
     """Checks that the mandatory config parameters are defined.

     "model_file" and "training_set" are always required; "ground_truth"
     and "query_file" are required only when "gen_training_set" is truthy.
     On a missing parameter the error is logged and the process exits with
     status 1.

     :param config: configuration dict
     """
     always_required = ("model_file", "training_set")
     gen_training_required = ("ground_truth", "query_file")
     try:
         required = list(always_required)
         if config.get("gen_training_set", False):
             required.extend(gen_training_required)
         for key in required:
             if key not in config:
                 # Leading space keeps the key and the text apart
                 raise Exception(key + " is not defined!")
     except Exception as e:
         # Logging formats lazily with "%": without a placeholder the
         # extra argument makes formatting fail and the reason is lost.
         PLOGGER.error("Error in config file: %s", e)
         # exit() is injected by the site module for interactive use;
         # raising SystemExit needs no import.
         raise SystemExit(1)
Exemplo n.º 10
0
    def check_config(config):
        """Checks config parameters and sets default values.

        Parameters that are missing (or None) are filled in place with their
        defaults. If the configuration is invalid, the error is logged and
        the process exits with status 1.

        :param config: configuration dict, modified in place
        """
        try:
            if config.get("index_name") is None:
                raise Exception("index_name is missing")

            # Checks first pass parameters
            if config.get("first_pass") is None:
                config["first_pass"] = {}
            first_pass = config["first_pass"]
            for key, default in (("1st_num_docs", 1000),
                                 ("field", Elastic.FIELD_CATCHALL),
                                 ("fields_return", ""),
                                 ("model", Elastic.BM25)):
                if first_pass.get(key) is None:
                    first_pass[key] = default

            for key, default in (("start", 0), ("num_docs", 100)):
                if config.get(key) is None:
                    config[key] = default

            model = config.get("model")
            if model in Retrieval.LM_MODELS:
                if config.get("smoothing_method") is None:
                    config["smoothing_method"] = ScorerLM.DIRICHLET
                if config.get("smoothing_param") is None:
                    # The default parameter depends on the smoothing method
                    if config["smoothing_method"] == ScorerLM.DIRICHLET:
                        config["smoothing_param"] = 2000
                    elif config["smoothing_method"] == ScorerLM.JM:
                        config["smoothing_param"] = 0.1
                    else:
                        raise Exception("Smoothing method is not supported.")

            # Default "fields" has a different shape for each model
            if config.get("fields") is None:
                if model == "lm":
                    config["fields"] = Elastic.FIELD_CATCHALL
                elif model == "mlm":
                    config["fields"] = {"similar_entity_names": 0.2, "catchall": 0.8}
                elif model == "prms":
                    config["fields"] = [Elastic.FIELD_CATCHALL]
        except Exception as e:
            # Logging formats lazily with "%": without a placeholder the
            # extra argument makes formatting fail and the reason is lost.
            PLOGGER.error("Error in config file: %s", e)
            sys.exit(1)
Exemplo n.º 11
0
Arquivo: ml.py Projeto: zxlzr/nordlys
    def __check_config(config):
        """Checks config parameters and sets default values.

        When a "cross_validation" section is present it must contain
        "splits_file"; its "k" defaults to 10. On an invalid configuration
        the error is logged and the process exits with status 1.

        :param config: configuration dict, modified in place
        """
        try:
            # NOTE: presence checks for "training_set", "output_file" and
            # "test_set" were commented out here and are not performed.
            if "cross_validation" in config:
                cv_config = config["cross_validation"]
                if "splits_file" not in cv_config:
                    raise Exception("splits_file is missing")
                if "k" not in cv_config:
                    cv_config["k"] = 10
        except Exception as e:
            # Logging formats lazily with "%": without a placeholder the
            # extra argument makes formatting fail and the reason is lost.
            PLOGGER.error("Error in config file: %s", e)
            # exit() is injected by the site module for interactive use;
            # raising SystemExit needs no import.
            raise SystemExit(1)
Exemplo n.º 12
0
    def append_set(self, doc_id, field, value):
        """Adds each element of a list to a set-valued field of a document.

        The update is an upsert using ``$addToSet`` with ``$each``. Errors
        raised by the update are logged, not propagated.

        :param doc_id: document id
        :param field: field
        :param value: list of the values to add to the field
        """
        try:
            selector = {Mongo.ID_FIELD: self.__escape(doc_id)}
            addition = {self.__escape(field): {'$each': value}}
            self.__collection.update(selector, {'$addToSet': addition},
                                     upsert=True)
        except Exception as err:
            message = ("\nError (doc_id: " + str(doc_id) + "), field: " +
                       field + "\n" + str(err))
            PLOGGER.error(message)
Exemplo n.º 13
0
 def __check_config(config):
     """Checks params and sets default values.

     ``KEY_COLLECTION``, ``KEY_PATH`` and ``KEY_FILES`` must be present;
     ``KEY_OPERATION`` defaults to ``KEY_APPEND``. Every configured file
     must be found while walking ``config[KEY_PATH]``. On an invalid
     configuration the error is logged and the process exits with status 1.

     :param config: configuration mapping, modified in place
     """
     try:
         if KEY_COLLECTION not in config:
             raise Exception(KEY_COLLECTION + " is missing")
         if KEY_OPERATION not in config:
             config[KEY_OPERATION] = KEY_APPEND
         if KEY_PATH not in config:
             raise Exception(KEY_PATH + " is missing")
         if KEY_FILES not in config:
             raise Exception(KEY_FILES + " is missing")
         # reads all files
         existing_files = set()
         for subdir, _dirs, file_names in os.walk(config[KEY_PATH]):
             existing_files.update(
                 os.path.join(subdir, file_name) for file_name in file_names)
         for file_conf in config[KEY_FILES]:
             # NOTE(review): plain concatenation, so KEY_PATH presumably
             # has to end with a path separator -- confirm with callers.
             dbpedia_file = config[KEY_PATH] + file_conf[KEY_FILE_NAME]
             if dbpedia_file not in existing_files:
                 raise Exception(dbpedia_file + " does not exist.")
     except Exception as e:
         # Logging formats lazily with "%": without a placeholder the
         # extra argument makes formatting fail and the reason is lost.
         PLOGGER.error("Error in config file: %s", e)
         sys.exit(1)