def get_categories(self, job_url):
    """Return the category-tree entries matching a job's stored categories.

    Reads every ``tbl_category`` row whose ``job_url`` column equals
    *job_url*, takes the third column (``row[2]``) as the category name and
    asks ``self.tp.get_most_similar`` for the matching tree entry.

    A category name may be hyphenated ("parent - leaf"), which jobs.bg uses
    to denote subcategories, e.g. "ИТ - Административни дейности и продажби"
    ("IT - Administrative activities and sales"). In that case the lookup
    key is the sanitized parent followed by the sanitized leaf; otherwise it
    is the sanitized category name itself.

    :param job_url: URL of the job, used as the lookup key in the database.
    :returns: list of the values returned by ``get_most_similar`` for every
        category that could be resolved; unresolved categories are logged
        and skipped.
    """
    categories_list = []

    # The query is assembled as text, so embedded single quotes must be
    # doubled (standard SQL escaping). Otherwise a URL containing "'" would
    # break the statement or allow SQL injection.
    # NOTE(review): switch to bound parameters if runQuery supports them.
    escaped_url = job_url.replace("'", "''")
    query = "SELECT * FROM tbl_category WHERE job_url='" + escaped_url + "'"
    all_categories = self.sqlite_reader.runQuery(query)

    for row in all_categories:
        cat = row[2]
        cat_splitted = self.__split_dashes(cat)

        if len(cat_splitted) == 2:
            # Hyphenated category: the key is parent + leaf.
            leaf = sanitize_id(cat_splitted[1].encode("utf-8"))
            parent = sanitize_id(cat_splitted[0].encode("utf-8"))
            most_similar = self.tp.get_most_similar(parent + leaf)
            resolved = bool(most_similar)
        elif len(cat_splitted) == 1:
            # Plain category: look it up in the tree as it is.
            leaf = sanitize_id(cat.encode("utf-8"))
            most_similar = self.tp.get_most_similar(leaf)
            # NOTE(review): this branch tests membership in self.tp while the
            # hyphenated branch tests truthiness; both checks are kept as
            # they were -- confirm which one is intended.
            resolved = most_similar in self.tp
        else:
            # Should never happen.
            JobCategorizer.logger.error(
                "Impossible category %s. Investigate.", cat)
            continue

        if resolved:
            categories_list.append(most_similar)
            JobCategorizer.logger.debug(
                "Category %s fully resolved for job %s", cat, job_url)
        else:
            # Not found; likely a misspecified category configuration.
            JobCategorizer.logger.warning(
                ("Mismatch in hierarchical structures "
                 "between database and category file "
                 "for category %s, job %s"), cat, job_url)

    return categories_list
def build_tree(self):
    """Populate the tree from the CSV category description.

    Does nothing when ``__read_csv_description`` yields a falsy result.
    Otherwise a ROOT node (identifier ``"root"``) is added first. Every row
    is then treated as a path: the first entry hangs under the root and each
    following entry hangs under the entry before it.

    Node identifiers are produced by ``sanitize_id``. A node directly under
    the root is identified by its own name; any deeper node is identified by
    its parent's identifier concatenated with its own name.
    """
    description_rows = self.__read_csv_description()
    if not description_rows:
        return

    self.add_node("ROOT", alternative_name="ROOT", identifier="root")

    for path in description_rows:
        # Each row starts again directly beneath the root.
        parent_id = "root"
        for raw_label in path:
            # A label such as "Технологии (Четвъртичен сектор)"
            # ("Technologies (Quaternary sector)") is split into
            # (node_name, alternative_name).
            node_name, alternative_name = self.__tokenize_nodename(raw_label)

            if parent_id == "root":
                qualified_name = node_name
            else:
                qualified_name = parent_id + node_name
            node_id = sanitize_id(qualified_name)

            self.add_node(node_name,
                          alternative_name,
                          identifier=node_id,
                          parent_identifier=parent_id)

            # The node just added becomes the parent of the next entry.
            parent_id = node_id