def load_doc_neighborhood_graph(self, nodes, graph_path=None,
                                get_stats: bool = config["graph"]["stats"]):
    """ Loads the graph file if found, else creates the neighborhood graph.

    :param nodes: List of node ids to consider.
    :param get_stats: Whether to compute and save graph statistics.
    :param graph_path: Full path to the graphml file.
    :return: Networkx graph and, if get_stats is True, stats related to the graph.
    """
    if graph_path is None:
        graph_path = join(self.graph_dir, self.dataset_name,
                          self.dataset_name + "_G_" + str(len(nodes)) + ".graphml")
    if exists(graph_path):
        logger.info("Loading neighborhood graph from [{0}]".format(graph_path))
        Docs_G = nx.read_graphml(graph_path)
    else:
        self.sample2cats = File_Util.load_json(
            join(self.graph_dir, self.dataset_name, self.dataset_name + "_sample2cats"))
        self.categories = File_Util.load_json(
            join(self.graph_dir, self.dataset_name, self.dataset_name + "_cats"))
        self.cat_id2text_map = File_Util.load_json(
            join(self.graph_dir, self.dataset_name, self.dataset_name + "_catid2cattxt_map"))
        Docs_G = self.create_neighborhood_graph(nodes=nodes)
        logger.debug(nx.info(Docs_G))
        logger.info("Saving neighborhood graph at [{0}]".format(graph_path))
        nx.write_graphml(Docs_G, graph_path)
    # Docs_adj = nx.adjacency_matrix(Docs_G)
    if get_stats:
        Docs_G_stats = self.graph_stats(Docs_G)
        File_Util.save_json(Docs_G_stats, filename=self.dataset_name + "_G_stats",
                            overwrite=True,
                            filepath=join(self.graph_dir, self.dataset_name))
        return Docs_G, Docs_G_stats
    return Docs_G
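

# --- Hedged, standalone sketch (not part of the pipeline above) ---
# Illustrates the cache-on-disk pattern used by load_doc_neighborhood_graph:
# read a GraphML file if it already exists, otherwise build the graph and
# persist it. The function name, the toy edge list and the "/tmp/demo_G.graphml"
# path are assumptions for illustration only.
def _demo_graphml_cache(path="/tmp/demo_G.graphml"):
    import os
    import networkx as nx

    if os.path.exists(path):
        # Cached copy found: load it instead of rebuilding.
        G = nx.read_graphml(path)
    else:
        # Build a toy neighborhood graph and cache it for the next call.
        G = nx.Graph()
        G.add_edges_from([("doc_1", "doc_2"), ("doc_2", "doc_3")])
        nx.write_graphml(G, path)
    return G
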
def calculate_idf_per_token(self, txts: list, subtract: int = 1) -> dict:
    """ Calculates the IDF score for each token in the corpus.

    :param txts: List of texts (documents) in the corpus.
    :param subtract: Value removed from the idf scores. Sometimes needed to get better scores.
    :return: Dict of token to idf score.
    """
    logger.info("Calculating IDF for each token.")
    if isfile(join(self.dataset_dir, self.dataset_name + "_idf_dict.json")):
        idf_dict = File_Util.load_json(filename=self.dataset_name + "_idf_dict",
                                       filepath=self.dataset_dir)
    else:
        from sklearn.feature_extraction.text import TfidfVectorizer

        ## Using TfidfVectorizer with spacy tokenizer; the same tokenizer should be used everywhere.
        vectorizer = TfidfVectorizer(decode_error='ignore', lowercase=False,
                                     smooth_idf=False, sublinear_tf=True,
                                     stop_words='english', ngram_range=(1, 1),
                                     max_df=0.7, vocabulary=None,
                                     tokenizer=self.tokenizer_spacy)
        tfidf_matrix = vectorizer.fit_transform(txts)
        idf = vectorizer.idf_
        ## Subtract `subtract` (default 1) from idf to get better scores.
        idf_dict = dict(zip(vectorizer.get_feature_names(), idf - subtract))
        ignored_tokens = vectorizer.stop_words_
        File_Util.save_json(idf_dict, filename=self.dataset_name + "_idf_dict",
                            filepath=self.dataset_dir)
    return idf_dict
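

# --- Hedged, standalone sketch (illustration only) ---
# Shows the token->IDF extraction that calculate_idf_per_token relies on:
# fit a TfidfVectorizer, read its idf_ attribute and optionally shift the
# scores down by a constant. The toy corpus and the `shift` value are
# assumptions; get_feature_names_out() assumes scikit-learn >= 1.0.
def _demo_token_idf(shift=1):
    from sklearn.feature_extraction.text import TfidfVectorizer

    corpus = ["the cat sat on the mat", "the dog sat", "a cat and a dog"]
    vectorizer = TfidfVectorizer(smooth_idf=False)
    vectorizer.fit(corpus)
    # idf_ is aligned with the vocabulary returned by get_feature_names_out().
    return dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_ - shift))
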
def gen_dicts(self):
    """ Filters txts, sample2cats and cattext2catid_map from Wikipedia text.

    :return: Dicts of txts, sample2cats and cattext2catid_map filtered from samples.
    """
    if isdir(self.raw_txt_dir):
        logger.info("Loading data from TXT files.")
        self.samples = self.read_txt_dir(self.raw_txt_dir)
    else:
        logger.info("Could not find TXT files: [{}]".format(self.raw_txt_dir))
        logger.info("Loading data from HTML files.")
        html_parser = self.get_html_parser()
        self.samples = self.read_html_dir(html_parser)
    classes = OrderedDict()
    hid_classes = OrderedDict()
    cats = OrderedDict()
    hid_cats = OrderedDict()
    txts = OrderedDict()
    cat_idx = 0
    hid_cat_idx = 0
    no_cat_ids = []  # List to store failed parsing cases.
    for doc_id, txt in self.samples.items():
        txt = list(filter(None, txt))  # Removing empty items.
        doc, filtered_cats, filtered_hid_cats = self.clean.filter_html_cats_reverse(txt)
        ## assert filtered_cats, "No category information was found for doc_id: [{0}].".format(doc_id)
        if filtered_cats:  ## Check that at least one category was successfully filtered from the html file.
            txts[doc_id] = clean_wiki(doc)  ## Removing category information and other texts from html pages.
            for lbl in filtered_cats:
                if lbl not in cats:  ## If lbl does not exist in cats already, add it and assign a new category index.
                    cats[lbl] = cat_idx
                    cat_idx += 1
                if doc_id in classes:  ## Check if doc_id exists, append if yes.
                    classes[doc_id].append(cats[lbl])
                else:  ## Create entry for doc_id if it does not exist.
                    classes[doc_id] = [cats[lbl]]
        else:  ## If no category was found, store the doc_id in a separate place for later inspection.
            logger.warning("No categories found in document: [{}].".format(doc_id))
            no_cat_ids.append(doc_id)
        ## Shall we use hidden category information?
        if filtered_hid_cats:  ## Check that at least one hidden category was successfully filtered from the html file.
            for lbl in filtered_hid_cats:
                if lbl not in hid_cats:  ## If lbl does not exist in hid_cats already, add it and assign a new hid_category index.
                    hid_cats[lbl] = hid_cat_idx
                    hid_cat_idx += 1
                if doc_id in hid_classes:  ## Check if doc_id exists, append if yes.
                    hid_classes[doc_id].append(hid_cats[lbl])
                else:  ## Create entry for doc_id if it does not exist.
                    hid_classes[doc_id] = [hid_cats[lbl]]
    logger.warning("No categories found for [{}] documents. Storing ids for reference in file '_no_cat_ids'."
                   .format(len(no_cat_ids)))
    File_Util.save_json(hid_classes, self.dataset_name + "_hid_classes", filepath=self.dataset_dir)
    File_Util.save_json(hid_cats, self.dataset_name + "_hid_cats", filepath=self.dataset_dir)
    File_Util.save_json(no_cat_ids, self.dataset_name + "_no_cat_ids", filepath=self.dataset_dir)
    logger.info("Number of txts: [{}], sample2cats: [{}] and cattext2catid_map: [{}]."
                .format(len(txts), len(classes), len(cats)))
    return txts, classes, cats
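

# --- Hedged, standalone sketch (illustration only) ---
# Shows the incremental label-indexing pattern used in gen_dicts above: every
# new category text gets the next integer id, and each document collects the
# ids of its categories. The toy `parsed_docs` data is an assumption.
def _demo_label_indexing():
    from collections import OrderedDict

    parsed_docs = {"doc_1": ["Physics", "Science"], "doc_2": ["Science"]}
    cats, classes, cat_idx = OrderedDict(), OrderedDict(), 0
    for doc_id, labels in parsed_docs.items():
        for lbl in labels:
            if lbl not in cats:      # First time this category text is seen:
                cats[lbl] = cat_idx  # assign it the next free integer id.
                cat_idx += 1
            classes.setdefault(doc_id, []).append(cats[lbl])
    return classes, cats             # e.g. {"doc_1": [0, 1], "doc_2": [1]}
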
def gen_dicts(self, json_path=None, encoding=config["text_process"]["encoding"],
              specials="""_-@*#'"/\\""", replace=' '):
    """ Generates the data dictionaries from the original json file.

    :param replace: Character to replace with.
    :param specials: Characters to clean from txts.
    :param json_path: Path to raw json file.
    :param encoding: Encoding for the raw json file.
    :return: txts, sample2cats, cattext2catid_map.
             Ids for which no categories were found are saved to "_no_cat_ids".
    """
    import ast  # The data is not proper JSON (single quotes instead of double quotes), so the "json" library will not work.
    from unidecode import unidecode

    logger.info("Generating the data dictionaries from the original json file.")
    txts = OrderedDict()
    classes = OrderedDict()
    cats = OrderedDict()
    no_cat_ids = []  # To store ids for which no cats were found.

    if json_path is None:
        json_path = self.raw_json_dir
    with sopen(json_path, encoding=encoding) as raw_json_ptr:
        trans_table = File_Util.make_trans_table(specials=specials, replace=replace)  # Creating mapping to clean txts.
        cat_idx = 0  # Holds the category index.
        for cnt, line in enumerate(raw_json_ptr):
            # Instead of: line_dict = OrderedDict(json.loads(line));
            # use ast.literal_eval, which handles the single-quoted records.
            line_dict = ast.literal_eval(line.strip().replace('\n', '\\n'))
            if "categories" in line_dict:  # Check if "categories" exists.
                if "title" in line_dict:  # Check if "title" exists, add if True.
                    txts[line_dict["asin"]] = unidecode(str(line_dict["title"])).translate(trans_table)
                    if "description" in line_dict:  # Check if "description" exists and append it to "title" with keyword ". \nDESC: ", if true.
                        txts[line_dict["asin"]] = txts[line_dict["asin"]] + ". \nDESC: " + unidecode(
                            str(line_dict["description"])).translate(trans_table)
                else:
                    if "description" in line_dict:  # If "description" exists even though "title" does not, use only "description".
                        txts[line_dict["asin"]] = ". \nDESC: " + line_dict["description"]
                    else:  # Report and skip the sample if neither "title" nor "description" exists.
                        logger.warning(
                            "Neither 'title' nor 'description' found for sample id: [{}]. Adding sample to 'no_cat_ids'."
                            .format(line_dict["asin"]))
                        no_cat_ids.append(line_dict["asin"])  # As neither "title" nor "description" exists, adding the id to "no_cat_ids".
                        continue
                classes[line_dict["asin"]] = line_dict["categories"][0]
                for lbl in classes[line_dict["asin"]]:
                    if lbl not in cats:  # If lbl does not exist in cats already, add it and assign a new category index.
                        cats[lbl] = cat_idx
                        cat_idx += 1
                    classes[line_dict["asin"]][classes[line_dict["asin"]].index(lbl)] = cats[lbl]  # Replacing category text with category id.
            else:  # If "categories" does not exist, add the id to "no_cat_ids".
                no_cat_ids.append(line_dict["asin"])
    File_Util.save_json(no_cat_ids, self.dataset_name + "_no_cat_ids", filepath=self.dataset_dir)
    logger.info("Number of txts: [{}], sample2cats: [{}] and cattext2catid_map: [{}]."
                .format(len(txts), len(classes), len(cats)))
    return txts, classes, cats
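

# --- Hedged, standalone sketch (illustration only) ---
# Shows the parsing trick used by gen_dicts for the Amazon-style metadata:
# each line is a Python-literal record (single quotes), so ast.literal_eval is
# used instead of json.loads, and a translation table strips unwanted
# characters. The sample record and the set of special characters are assumptions.
def _demo_parse_pseudo_json_line():
    import ast

    line = "{'asin': 'B0001', 'title': 'A *great* #gadget', 'categories': [['Electronics']]}"
    record = ast.literal_eval(line.strip())              # json.loads() would fail on the single quotes.
    trans_table = str.maketrans("_-@*#'\"/\\", " " * 9)  # Map each special character to a space.
    title = str(record["title"]).translate(trans_table)
    return record["asin"], title, record["categories"][0]
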
def create_new_data(self, new_data_name: str = "_pointer", save_files: bool = True,
                    save_dir: str = None, catid2cattxt_map: OrderedDict = None):
    """ Creates a new dataset based on the new_data_name value.

    Currently supports: "_fixed5", "_onehot", "_pointer", "_fewshot" and "_firstsent".

    _fixed5: Creates a dataset of samples which belong to any of 5 fixed categories only.
    _onehot: Creates a dataset in which each sample belongs to a single class only.

    NOTE: This method is used only for sanity testing using a fixed multi-class scenario.
    """
    if save_dir is None:
        save_dir = join(self.dataset_dir, self.dataset_name + new_data_name)
    if isfile(join(save_dir, self.dataset_name + new_data_name + "_sample2cats.json"))\
            and isfile(join(save_dir, self.dataset_name + new_data_name + "_txts.json"))\
            and isfile(join(save_dir, self.dataset_name + new_data_name + "_cats.json")):
        logger.info("Loading files from: [{}]".format(save_dir))
        txts_new = File_Util.load_json(self.dataset_name + new_data_name + "_txts",
                                       filepath=save_dir)
        sample2cats_new = File_Util.load_json(self.dataset_name + new_data_name + "_sample2cats",
                                              filepath=save_dir)
        cats_new = File_Util.load_json(self.dataset_name + new_data_name + "_cats",
                                       filepath=save_dir)
    else:
        logger.info("No existing files found at [{}]. Generating {} files.".format(save_dir, new_data_name))
        if catid2cattxt_map is None:
            catid2cattxt_map = File_Util.load_json(self.dataset_name + "_catid2cattxt_map",
                                                   filepath=self.dataset_dir)
        txts, classes, _ = self.load_full_json(return_values=True)
        if new_data_name == "_fixed5":
            txts_one, classes_one, _ = self._create_oneclass_data(
                txts, classes, catid2cattxt_map=catid2cattxt_map)
            txts_new, sample2cats_new, cats_new = self._create_fixed_cat_data(
                txts_one, classes_one, catid2cattxt_map=catid2cattxt_map)
        elif new_data_name == "_onehot":
            txts_new, sample2cats_new, cats_new = self._create_oneclass_data(
                txts, classes, catid2cattxt_map=catid2cattxt_map)
        elif new_data_name == "_pointer":
            txts_new, sample2cats_new, cats_new = self._create_pointer_data(
                txts, classes, catid2cattxt_map=catid2cattxt_map)
        elif new_data_name == "_fewshot":
            txts_new, sample2cats_new, cats_new = self._create_fewshot_data(
                txts, classes, catid2cattxt_map=catid2cattxt_map)
        elif new_data_name == "_firstsent":
            txts_new, sample2cats_new, cats_new = self._create_firstsent_data(
                txts, classes, catid2cattxt_map=catid2cattxt_map)
        else:
            raise Exception("Unknown 'new_data_name': [{}].\nAvailable options: "
                            "['_fixed5', '_onehot', '_pointer', '_fewshot', '_firstsent']"
                            .format(new_data_name))
        if save_files:  # Storing the new data.
            logger.info("New dataset will be stored inside the original dataset directory at: [{}]"
                        .format(save_dir))
            makedirs(save_dir, exist_ok=True)
            File_Util.save_json(txts_new, self.dataset_name + new_data_name + "_txts", filepath=save_dir)
            File_Util.save_json(sample2cats_new, self.dataset_name + new_data_name + "_sample2cats", filepath=save_dir)
            File_Util.save_json(cats_new, self.dataset_name + new_data_name + "_cats", filepath=save_dir)
    return txts_new, sample2cats_new, cats_new
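

# --- Hedged, standalone sketch (illustration only) ---
# Shows the "load the cached JSON artifacts if they all exist, otherwise
# generate and persist them" pattern shared by create_new_data and
# load_full_json. The function name, the build_fn callable and the file
# names are assumptions.
def _demo_json_artifact_cache(save_dir, names, build_fn):
    import json
    import os

    paths = {n: os.path.join(save_dir, n + ".json") for n in names}
    if all(os.path.isfile(p) for p in paths.values()):
        # Every expected artifact is already on disk: just load them.
        loaded = {}
        for n, p in paths.items():
            with open(p, encoding="utf-8") as f:
                loaded[n] = json.load(f)
        return loaded
    # Otherwise build the artifacts once and cache them for later runs.
    artifacts = build_fn()
    os.makedirs(save_dir, exist_ok=True)
    for n, p in paths.items():
        with open(p, "w", encoding="utf-8") as f:
            json.dump(artifacts[n], f)
    return artifacts
# Example call (hypothetical):
# _demo_json_artifact_cache("/tmp/demo_data", ["txts", "cats"], lambda: {"txts": {}, "cats": {}})
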
def load_full_json(self, return_values: bool = False):
    """ Loads the full dataset and splits the data into train, val and test sets. """
    if isfile(join(self.dataset_dir, self.dataset_name + "_txts.json"))\
            and isfile(join(self.dataset_dir, self.dataset_name + "_sample2cats.json"))\
            and isfile(join(self.dataset_dir, self.dataset_name + "_cats.json")):
        logger.info("Loading pre-processed json files from: [{}]".format(
            join(self.dataset_dir, self.dataset_name + "_txts.json")))
        txts = File_Util.load_json(self.dataset_name + "_txts", filepath=self.dataset_dir, show_path=True)
        classes = File_Util.load_json(self.dataset_name + "_sample2cats", filepath=self.dataset_dir, show_path=True)
        categories = File_Util.load_json(self.dataset_name + "_cats", filepath=self.dataset_dir, show_path=True)
        assert len(txts) == len(classes),\
            "Count of txts [{0}] and sample2cats [{1}] should match.".format(len(txts), len(classes))
    else:
        logger.warning("Pre-processed json files not found at: [{}]".format(
            join(self.dataset_dir, self.dataset_name + "_txts.json")))
        logger.info("Loading raw data and creating 3 separate dicts of txts [id -> texts],"
                    " sample2cats [id -> class_ids] and categories [class_name -> class_id].")
        txts, classes, categories = self.load_raw_data(self.dataset_type)
        File_Util.save_json(categories, self.dataset_name + "_cats", filepath=self.dataset_dir)
        File_Util.save_json(txts, self.dataset_name + "_txts", filepath=self.dataset_dir)
        File_Util.save_json(classes, self.dataset_name + "_sample2cats", filepath=self.dataset_dir)
        logger.info("Cleaning categories.")
        categories, categories_dup_dict, dup_cat_text_map = self.clean.clean_categories(categories)
        File_Util.save_json(dup_cat_text_map, self.dataset_name + "_dup_cat_text_map",
                            filepath=self.dataset_dir, overwrite=True)
        File_Util.save_json(categories, self.dataset_name + "_cats",
                            filepath=self.dataset_dir, overwrite=True)
        if categories_dup_dict:  # Replace old category ids with new ids if duplicate categories were found.
            File_Util.save_json(categories_dup_dict, self.dataset_name + "_categories_dup_dict",
                                filepath=self.dataset_dir, overwrite=True)  # Storing the duplicate categories for future removal of duplicates.
            classes = self.clean.dedup_data(classes, categories_dup_dict)
        assert len(txts) == len(classes),\
            "Count of txts [{0}] and sample2cats [{1}] should match.".format(len(txts), len(classes))
        File_Util.save_json(txts, self.dataset_name + "_txts", filepath=self.dataset_dir, overwrite=True)
        File_Util.save_json(classes, self.dataset_name + "_sample2cats", filepath=self.dataset_dir, overwrite=True)
        logger.info("Saved txts [{0}], sample2cats [{1}] and categories [{2}] as json files."
                    .format(join(self.dataset_dir, self.dataset_name + "_txts.json"),
                            join(self.dataset_dir, self.dataset_name + "_sample2cats.json"),
                            join(self.dataset_dir, self.dataset_name + "_cats.json")))
    if return_values:
        return txts, classes, categories
    else:
        # Splitting data into train, validation and test sets.
        self.txts_train, self.sample2cats_train, self.cats_sel, self.txts_val, self.sample2cats_val,\
            self.cats_val, self.txts_test, self.sample2cats_test, self.cats_test, catid2cattxt_map =\
            self.split_data(txts=txts, classes=classes, categories=categories)
        txts, classes, categories = None, None, None  # Remove large dicts and free up memory.
        collect()
        File_Util.save_json(self.txts_test, self.dataset_name + "_txts_test", filepath=self.dataset_dir)
        File_Util.save_json(self.sample2cats_test, self.dataset_name + "_sample2cats_test", filepath=self.dataset_dir)
        File_Util.save_json(self.txts_val, self.dataset_name + "_txts_val", filepath=self.dataset_dir)
        File_Util.save_json(self.sample2cats_val, self.dataset_name + "_sample2cats_val", filepath=self.dataset_dir)
        File_Util.save_json(self.txts_train, self.dataset_name + "_txts_train", filepath=self.dataset_dir)
        File_Util.save_json(self.sample2cats_train, self.dataset_name + "_sample2cats_train", filepath=self.dataset_dir)
        File_Util.save_json(self.cats_sel, self.dataset_name + "_cats_train", filepath=self.dataset_dir)
        File_Util.save_json(self.cats_val, self.dataset_name + "_cats_val", filepath=self.dataset_dir)
        File_Util.save_json(self.cats_test, self.dataset_name + "_cats_test", filepath=self.dataset_dir)
        File_Util.save_json(catid2cattxt_map, self.dataset_name + "_catid2cattxt_map", filepath=self.dataset_dir)
        return self.txts_train, self.sample2cats_train, self.cats_sel, self.txts_val, self.sample2cats_val,\
            self.cats_val, self.txts_test, self.sample2cats_test, self.cats_test
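

# --- Hedged, standalone sketch (illustration only) ---
# Illustrates the kind of consistent train/val/test split that split_data is
# expected to produce for parallel dicts of texts and labels: split the sample
# ids once and index both dicts with the same id lists. The split ratios, seed
# and helper name are assumptions; the real split_data may differ.
def _demo_split_dicts(txts, classes, val_frac=0.1, test_frac=0.1, seed=0):
    import random

    ids = list(txts.keys())
    random.Random(seed).shuffle(ids)
    n_test = int(len(ids) * test_frac)
    n_val = int(len(ids) * val_frac)
    test_ids, val_ids, train_ids = ids[:n_test], ids[n_test:n_test + n_val], ids[n_test + n_val:]

    def take(d, keys):
        # Select a sub-dict so texts and labels stay aligned by sample id.
        return {k: d[k] for k in keys}

    return (take(txts, train_ids), take(classes, train_ids),
            take(txts, val_ids), take(classes, val_ids),
            take(txts, test_ids), take(classes, test_ids))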