def __check_config(config):
    """Checks config parameters and sets default values.

    :param config: configuration dict; must contain KEY_COLLECTION.
        KEY_LOWERCASE defaults to True when absent.
    """
    try:
        if KEY_COLLECTION not in config:
            raise Exception(KEY_COLLECTION + " is missing")
        # Lowercasing is enabled by default.
        if KEY_LOWERCASE not in config:
            config[KEY_LOWERCASE] = True
    except Exception as e:
        # Lazy %-arg so the exception text is actually logged (the original
        # passed it as a bare extra argument, which logging drops).
        PLOGGER.error("Error in config file: %s", e)
        sys.exit(1)
def check_config(config):
    """Checks retrieval config parameters and sets default values.

    :param config: configuration dict; "index_name" is required. Defaults are
        filled in for first-pass retrieval, paging, LM smoothing, and the
        model-specific "fields" setting (str for LM, dict for MLM, list for
        PRMS). Exits the process on an invalid config.
    """
    def _default(d, key, value):
        # Treats an explicit None value the same as a missing key.
        if d.get(key, None) is None:
            d[key] = value

    try:
        if config.get("index_name", None) is None:
            raise Exception("index_name is missing")
        # First-pass retrieval parameters
        _default(config, "first_pass", {})
        first_pass = config["first_pass"]
        _default(first_pass, "1st_num_docs", 1000)
        _default(first_pass, "field", Elastic.FIELD_CATCHALL)
        _default(first_pass, "fields_return", "")
        _default(first_pass, "model", Elastic.BM25)
        # Paging
        _default(config, "start", 0)
        _default(config, "num_docs", 100)
        # Language-model smoothing
        if config.get("model", None) in Retrieval.LM_MODELS:
            _default(config, "smoothing_method", ScorerLM.DIRICHLET)
            if config.get("smoothing_param", None) is None:
                if config["smoothing_method"] == ScorerLM.DIRICHLET:
                    config["smoothing_param"] = 2000
                elif config["smoothing_method"] == ScorerLM.JM:
                    config["smoothing_param"] = 0.1
                else:
                    raise Exception("Smoothing method is not supported.")
        # Model-specific type of the "fields" setting
        model = config.get("model", None)
        if model == "lm":
            if config.get("fields", None) is None:
                config["fields"] = Elastic.FIELD_CATCHALL
            elif not isinstance(config["fields"], str):
                raise Exception("Only a single field is required for LM.")
        if model == "mlm":
            if config.get("fields", None) is None:
                config["fields"] = {Elastic.FIELD_CATCHALL: 1}
            elif not isinstance(config["fields"], dict):
                raise Exception(
                    "A dictionary of fields and their weights is required for MLM."
                )
        if model == "prms":
            if config.get("fields", None) is None:
                config["fields"] = [Elastic.FIELD_CATCHALL]
            elif not isinstance(config["fields"], list):
                raise Exception("A list of fields is required for PRMS.")
    except Exception as e:
        # Lazy %-arg so the exception text is actually logged.
        PLOGGER.error("Error in config file: %s", e)
        sys.exit(1)
def rank_ens(self):
    """Ranks candidate instances according to the learned LTR model.

    NOTE(review): the original docstring (':param n: length of n-gram',
    commonness dict) looked copy-pasted from elsewhere and was replaced.

    :return: the candidate instances from get_candidate_inss(), after
        ML.apply_model has scored them against the learned model
    """
    # Only logs when the model is missing; execution still continues and
    # apply_model is called with None — presumably fails downstream. TODO confirm.
    if self.__model is None:
        PLOGGER.error("LTR model is not defined.")
    inss = self.get_candidate_inss()
    ML({}).apply_model(inss, self.__model)
    return inss
def __check_config(config):
    """Checks params and set default values.

    :param config: configuration dict; must contain KEY_COLLECTION and
        KEY_MAPPING_FILE, and the mapping file must exist on disk.
    :return: the validated config dict
    """
    try:
        if KEY_COLLECTION not in config:
            raise Exception(KEY_COLLECTION + " is missing")
        if KEY_MAPPING_FILE not in config:
            raise Exception(KEY_MAPPING_FILE + " is missing")
        if not op.exists(config[KEY_MAPPING_FILE]):
            raise Exception("Mapping file path does not exist.")
    except Exception as e:
        # Lazy %-arg so the exception text is actually logged; sys.exit for
        # consistency with the other config checkers in this project.
        PLOGGER.error("Error in config file: %s", e)
        sys.exit(1)
    return config
def add(self, doc_id, contents):
    """Adds a document or replaces the contents of an entire document.

    :param doc_id: document id (escaped before use as the Mongo id)
    :param contents: dict of field -> value; keys are escaped for Mongo
    """
    # Escape every content key so it is safe as a Mongo field name.
    escaped = {self.__escape(field): val for field, val in contents.items()}
    selector = {Mongo.ID_FIELD: self.__escape(doc_id)}
    try:
        self.__collection.update(selector, {'$set': escaped}, upsert=True)
    except Exception as e:
        PLOGGER.error("\nError (doc_id: " + str(doc_id) + ")\n" + str(e))
def main(args):
    """Builds the DBpedia types index after verifying the required files.

    :param args: parsed CLI arguments; args.config is the config file path
    """
    config = FileUtils.load_config(args.config)
    dbpedia_path = config.get("dbpedia_files_path", "")
    # Every abstracts/types dump file must be present before indexing starts.
    PLOGGER.info(f"Checking needed DBpedia files under {dbpedia_path}")
    needed = [ENTITY_ABSTRACTS_FILE] + ENTITY_TYPES_FILES
    for fname in needed:
        if not os.path.isfile(os.sep.join([dbpedia_path, fname])):
            PLOGGER.error(f" - {fname}: Missing")
            exit(1)
        PLOGGER.info(f" - {fname}: OK")
    indexer = IndexerDBpediaTypes(config)
    indexer.build_index(force=True)
def __init__(self, elastic, query, params):
    """Language-model scorer.

    :param elastic: Elastic instance used for retrieval
    :param query: query to be scored
    :param params: dict with optional keys "fields" (default: catch-all
        field), "smoothing_method" (case-insensitive; Dirichlet or JM), and
        "smoothing_param" (default: 2000 for Dirichlet, 0.1 for JM)
    """
    super(ScorerLM, self).__init__(elastic, query, params)
    self._field = params.get("fields", Elastic.FIELD_CATCHALL)
    self._smoothing_method = params.get("smoothing_method", self.DIRICHLET).lower()
    if self._smoothing_method == self.DIRICHLET:
        self._smoothing_param = params.get("smoothing_param", 2000)
    elif self._smoothing_method == self.JM:  # self.JM for consistency with self.DIRICHLET above
        self._smoothing_param = params.get("smoothing_param", 0.1)
    else:
        PLOGGER.error(self._smoothing_method + " smoothing method is not supported!")
        # Non-zero status: this is an error path (the original called
        # sys.exit(0), signalling success on failure).
        sys.exit(1)
    # Per-term term-frequency cache.
    self._tf = {}
def main(args):
    """Builds the DBpedia URI index.

    :param args: parsed CLI arguments; args.config is the config file path
    """
    config = FileUtils.load_config(args.config)
    if "_uri" not in config["index_name"]:
        PLOGGER.error("index name might not be correct, please check again!")
        # Non-zero status: this is a configuration error (the original
        # exited with 0, signalling success).
        exit(1)
    if "fields_file" not in config:
        fields_count = compute_field_counts()
    else:
        # json.load needs a file object; the original passed the path string
        # itself, which would raise an AttributeError at runtime.
        with open(config["fields_file"]) as f:
            fields_count = json.load(f)
    indexer = IndexerDBpediaURI(config, fields_count)
    indexer.build()
    PLOGGER.info("Index build: " + config["index_name"])
def __check_config(config):
    """Checks config parameters and set default values.

    :param config: configuration dict; "model_file" and "training_set" are
        always required, "ground_truth" and "query_file" additionally when
        "gen_training_set" is set. Exits the process on an invalid config.
    """
    # Named lists instead of the original's magic index ranges into one list.
    always_required = ["model_file", "training_set"]
    training_required = ["ground_truth", "query_file"]
    try:
        for key in always_required:
            if key not in config:
                # Original message lacked the leading space ("…fileis not defined!").
                raise Exception(key + " is not defined!")
        if config.get("gen_training_set", False):
            for key in training_required:
                if key not in config:
                    raise Exception(key + " is not defined!")
    except Exception as e:
        # Lazy %-arg so the exception text is actually logged.
        PLOGGER.error("Error in config file: %s", e)
        exit(1)
def check_config(config):
    """Checks config parameters and sets default values.

    :param config: configuration dict; "index_name" is required. Defaults are
        filled in for first-pass retrieval, paging, LM smoothing, and the
        model-specific "fields" setting. Exits the process on an invalid
        config.
    """
    def _default(d, key, value):
        # Treats an explicit None value the same as a missing key.
        if d.get(key, None) is None:
            d[key] = value

    try:
        if config.get("index_name", None) is None:
            raise Exception("index_name is missing")
        # First-pass retrieval parameters
        _default(config, "first_pass", {})
        first_pass = config["first_pass"]
        _default(first_pass, "1st_num_docs", 1000)
        _default(first_pass, "field", Elastic.FIELD_CATCHALL)
        _default(first_pass, "fields_return", "")
        _default(first_pass, "model", Elastic.BM25)
        # Paging
        _default(config, "start", 0)
        _default(config, "num_docs", 100)
        # Language-model smoothing
        if config.get("model", None) in Retrieval.LM_MODELS:
            _default(config, "smoothing_method", ScorerLM.DIRICHLET)
            if config.get("smoothing_param", None) is None:
                if config["smoothing_method"] == ScorerLM.DIRICHLET:
                    config["smoothing_param"] = 2000
                elif config["smoothing_method"] == ScorerLM.JM:
                    config["smoothing_param"] = 0.1
                else:
                    raise Exception("Smoothing method is not supported.")
        # Model-specific defaults for the "fields" setting
        model = config.get("model", None)
        if model == "lm":
            _default(config, "fields", Elastic.FIELD_CATCHALL)
        if model == "mlm":
            _default(config, "fields",
                     {"similar_entity_names": 0.2, "catchall": 0.8})
        if model == "prms":
            _default(config, "fields", [Elastic.FIELD_CATCHALL])
    except Exception as e:
        # Lazy %-arg so the exception text is actually logged.
        PLOGGER.error("Error in config file: %s", e)
        sys.exit(1)
def __check_config(config):
    """Checks config parameters and set default values.

    Only the optional "cross_validation" section is validated: it must name
    a "splits_file", and "k" (number of folds) defaults to 10. (Checks on
    "training_set"/"output_file"/"test_set" were disabled in the original
    as commented-out code and have been removed.)
    """
    try:
        if "cross_validation" in config:
            cv = config["cross_validation"]
            if "splits_file" not in cv:
                raise Exception("splits_file is missing")
            if "k" not in cv:
                cv["k"] = 10
    except Exception as e:
        # Lazy %-arg so the exception text is actually logged.
        PLOGGER.error("Error in config file: %s", e)
        exit(1)
def append_set(self, doc_id, field, value):
    """Adds a list of values to a set field, creating the field if needed.

    :param doc_id: document id
    :param field: field name (escaped before use)
    :param value: list of values to be added to the set
    """
    selector = {Mongo.ID_FIELD: self.__escape(doc_id)}
    # $addToSet with $each adds every element of `value`, skipping duplicates.
    update_op = {'$addToSet': {self.__escape(field): {'$each': value}}}
    try:
        self.__collection.update(selector, update_op, upsert=True)
    except Exception as e:
        PLOGGER.error("\nError (doc_id: " + str(doc_id) + "), field: " + field + "\n" + str(e))
def __check_config(config):
    """Checks params and set default values.

    Requires KEY_COLLECTION, KEY_PATH and KEY_FILES; KEY_OPERATION defaults
    to KEY_APPEND. Every configured file must exist under KEY_PATH.
    """
    try:
        if KEY_COLLECTION not in config:
            raise Exception(KEY_COLLECTION + " is missing")
        if KEY_OPERATION not in config:
            config[KEY_OPERATION] = KEY_APPEND
        if KEY_PATH not in config:
            raise Exception(KEY_PATH + " is missing")
        if KEY_FILES not in config:
            raise Exception(KEY_FILES + " is missing")
        # Collect all files that exist under the configured path.
        # (Loop vars renamed: the original's `dir`/`file` shadowed builtins.)
        existing_files = set()
        for subdir, _dirs, fnames in os.walk(config[KEY_PATH]):
            for fname in fnames:
                existing_files.add(os.path.join(subdir, fname))
        for entry in config[KEY_FILES]:
            # Plain concatenation kept as-is — assumes KEY_PATH ends with a
            # path separator so it matches the os.path.join'd entries above.
            dbpedia_file = config[KEY_PATH] + entry[KEY_FILE_NAME]
            if dbpedia_file not in existing_files:
                raise Exception(dbpedia_file + " does not exist.")
    except Exception as e:
        # Lazy %-arg so the exception text is actually logged.
        PLOGGER.error("Error in config file: %s", e)
        sys.exit(1)