示例#1
0
    def load_doc_neighborhood_graph(
            self,
            nodes,
            graph_path=None,
            get_stats: bool = config["graph"]["stats"]):
        """ Returns the document neighborhood graph, building it if not cached.

        If a graphml file exists at ``graph_path`` it is read back with
        networkx. Otherwise the category json files of the dataset are loaded
        onto ``self``, the graph is built by ``create_neighborhood_graph`` and
        the result is written to ``graph_path``.

        :param nodes: List of node ids to consider; the number of nodes is
            part of the default graph file name.
        :param graph_path: Full path to the graphml file. When None, defaults to
            ``<graph_dir>/<dataset_name>/<dataset_name>_G_<len(nodes)>.graphml``.
        :param get_stats: If true, graph statistics are computed, saved as
            json next to the graph and returned along with the graph.
        :return: ``(graph, stats)`` if ``get_stats`` is true, else only the
            networkx graph.
        """
        if graph_path is None:
            default_name = self.dataset_name + "_G_" + str(len(nodes)) + ".graphml"
            graph_path = join(self.graph_dir, self.dataset_name, default_name)

        if not exists(graph_path):
            # Cache miss: the graph builder needs the category files on self.
            self.sample2cats = File_Util.load_json(
                join(self.graph_dir, self.dataset_name,
                     self.dataset_name + "_sample2cats"))
            self.categories = File_Util.load_json(
                join(self.graph_dir, self.dataset_name,
                     self.dataset_name + "_cats"))
            self.cat_id2text_map = File_Util.load_json(
                join(self.graph_dir, self.dataset_name,
                     self.dataset_name + "_catid2cattxt_map"))

            Docs_G = self.create_neighborhood_graph(nodes=nodes)
            logger.debug(nx.info(Docs_G))

            save_msg = "Saving neighborhood graph at [{0}]".format(graph_path)
            logger.info(save_msg)
            nx.write_graphml(Docs_G, graph_path)
        else:
            load_msg = "Loading neighborhood graph from [{0}]".format(graph_path)
            logger.info(load_msg)
            Docs_G = nx.read_graphml(graph_path)

        if not get_stats:
            return Docs_G

        # Statistics are recomputed and overwritten on every call.
        Docs_G_stats = self.graph_stats(Docs_G)
        stats_dir = join(self.graph_dir, self.dataset_name)
        File_Util.save_json(Docs_G_stats,
                            filename=self.dataset_name + "_G_stats",
                            overwrite=True,
                            filepath=stats_dir)
        return Docs_G, Docs_G_stats
示例#2
0
    def calculate_idf_per_token(self,txts: list,subtract: int = 1) -> dict:
        """ Calculates the idf score of each token in the corpus.

        The scores are cached as ``<dataset_name>_idf_dict.json`` inside
        ``self.dataset_dir``; if that file exists it is loaded instead of
        refitting the vectorizer.

        :param txts: Iterable of raw documents used to fit the vectorizer.
        :param subtract: Removes this value from idf scores. Sometimes needed to get better scores.
        :return: Dict of token to idf score.
        """
        logger.info("Calculating IDF for each token.")
        idf_filename = self.dataset_name + "_idf_dict"
        ## The cache check has to look for the same file that is loaded and saved below
        ## (the original looked for "_tfidf_dict.json", which is never written here).
        if isfile(join(self.dataset_dir,idf_filename + ".json")):
            return File_Util.load_json(filename=idf_filename,filepath=self.dataset_dir)

        from sklearn.feature_extraction.text import TfidfVectorizer
        ## Using TfidfVectorizer with spacy tokenizer; same tokenizer should be used everywhere.
        vectorizer = TfidfVectorizer(decode_error='ignore',lowercase=False,smooth_idf=False,
                                     sublinear_tf=True,stop_words='english',ngram_range=(1,1),
                                     max_df=0.7,vocabulary=None,tokenizer=self.tokenizer_spacy)
        vectorizer.fit(txts)  ## Only the fitted idf_ vector is needed, not the tfidf matrix.

        ## get_feature_names() was removed from newer scikit-learn releases;
        ## prefer get_feature_names_out() and fall back for older versions.
        if hasattr(vectorizer,"get_feature_names_out"):
            tokens = vectorizer.get_feature_names_out()
        else:
            tokens = vectorizer.get_feature_names()

        idf_dict = dict(zip(tokens,vectorizer.idf_ - subtract))  ## Subtract from idf to get better scores
        File_Util.save_json(idf_dict,filename=idf_filename,filepath=self.dataset_dir)

        return idf_dict
示例#3
0
    def gen_dicts(self):
        """Filters txts, sample2cats and cattext2catid_map from wikipedia text.

        Samples are read from ``self.raw_txt_dir`` if it is a directory,
        otherwise they are parsed from HTML files. For each document the
        category and hidden category labels are separated from the text and
        every new label gets the next free integer id.

        Hidden classes, hidden categories and the ids of documents without
        any category are saved as json files in ``self.dataset_dir``; they are
        not returned.

        :return: Dict of txts, sample2cats and cattext2catid_map filtered from samples.
        """

        if isdir(self.raw_txt_dir):
            logger.info("Loading data from TXT files.")
            self.samples = self.read_txt_dir(self.raw_txt_dir)
        else:
            logger.info("Could not find TXT files: [{}]".format(self.raw_txt_dir))
            logger.info("Loading data from HTML files.")
            html_parser = self.get_html_parser()
            self.samples = self.read_html_dir(html_parser)

        classes = OrderedDict()
        hid_classes = OrderedDict()
        cats = OrderedDict()
        hid_cats = OrderedDict()
        txts = OrderedDict()
        no_cat_ids = []  # List to store failed parsing cases.
        for doc_id,txt in self.samples.items():
            txt = list(filter(None,txt))  # Removing empty items
            doc,filtered_cats,filtered_hid_cats = self.clean.filter_html_cats_reverse(txt)
            if filtered_cats:  ## Check at least one category was successfully filtered from html file.
                txts[doc_id] = clean_wiki(doc)  ## Removing category information and other texts from html pages.
                for lbl in filtered_cats:
                    ## A new label gets the next free index, i.e. the current size of the map.
                    cat_id = cats.setdefault(lbl,len(cats))
                    classes.setdefault(doc_id,[]).append(cat_id)
            else:  ## If no category was found, store the doc_id in a separate place for later inspection.
                logger.warning("No categories found in document: [{}].".format(doc_id))
                no_cat_ids.append(doc_id)

            ## Hidden categories are collected even for documents without regular categories.
            if filtered_hid_cats:  ## Check at least one hidden category was successfully filtered from html file.
                for lbl in filtered_hid_cats:
                    hid_cat_id = hid_cats.setdefault(lbl,len(hid_cats))
                    hid_classes.setdefault(doc_id,[]).append(hid_cat_id)
        logger.warning("No cattext2catid_map found for: [{}] documents. Storing ids for reference in file '_no_cat_ids'."
                       .format(len(no_cat_ids)))
        File_Util.save_json(hid_classes,self.dataset_name + "_hid_classes",filepath=self.dataset_dir)
        File_Util.save_json(hid_cats,self.dataset_name + "_hid_cats",filepath=self.dataset_dir)
        File_Util.save_json(no_cat_ids,self.dataset_name + "_no_cat_ids",filepath=self.dataset_dir)
        logger.info("Number of txts: [{}], sample2cats: [{}] and cattext2catid_map: [{}]."
                    .format(len(txts),len(classes),len(cats)))
        return txts,classes,cats
示例#4
0
    def gen_dicts(self,
                  json_path=None,
                  encoding=config["text_process"]["encoding"],
                  specials="""_-@*#'"/\\""",
                  replace=' '):
        """
        Generates the data dictionaries from original json file.

        Each non-empty line of the file is parsed as a python dict literal.
        The sample text is the cleaned "title", followed by ". \\nDESC: " and
        the cleaned "description" when present; a sample with only a
        "description" uses ". \\nDESC: " plus the cleaned description.
        Category labels are read from the first list under "categories" and
        mapped to integer ids in order of first appearance.

        Ids of samples that have no categories, or neither "title" nor
        "description", are saved to the "_no_cat_ids" json file; they are not
        returned.

        :param replace: Character to replace with.
        :param specials: Characters to clean from txts.
        :param json_path: Path to raw json file, defaults to ``self.raw_json_dir``.
        :param encoding: Encoding for the raw json file.
        :return: txts, sample2cats, cattext2catid_map
        """
        import ast  # As the data is not proper JSON (single-quote instead of double-quote) format, "json" library will not work.
        from unidecode import unidecode

        logger.info("Generates the data dictionaries from original json file.")
        txts = OrderedDict()
        classes = OrderedDict()
        cats = OrderedDict()
        no_cat_ids = []  # To store ids for which no cats were found.

        if json_path is None:
            json_path = self.raw_json_dir
        trans_table = File_Util.make_trans_table(
            specials=specials,
            replace=replace)  # Creating mapping to clean txts.

        def _clean(raw):
            # Same normalisation for title and description.
            return unidecode(str(raw)).translate(trans_table)

        with sopen(json_path, encoding=encoding) as raw_json_ptr:
            for line in raw_json_ptr:
                line = line.strip()
                if not line:  # A blank line cannot be parsed as a literal.
                    continue
                line_dict = ast.literal_eval(line.replace('\n', '\\n'))
                sample_id = line_dict["asin"]

                # NOTE(review): the original tested for "categories" but then
                # read line_dict["cats"], which raises KeyError; "categories"
                # is assumed to be the real key - confirm against the raw data.
                categories = line_dict.get("categories")
                if not categories:  # Missing or empty category information.
                    no_cat_ids.append(sample_id)
                    continue

                has_title = "title" in line_dict
                has_desc = "description" in line_dict
                if not has_title and not has_desc:
                    # Report and skip the sample if neither "title" nor "description" exists.
                    logger.warning(
                        "Neither 'title' nor 'description' found for sample id: [{}]. Adding sample to 'no_cat_ids'."
                        .format(sample_id))
                    no_cat_ids.append(sample_id)
                    continue

                text = _clean(line_dict["title"]) if has_title else ""
                if has_desc:
                    text += ". \nDESC: " + _clean(line_dict["description"])
                txts[sample_id] = text

                # Replace category texts by ids; a new label gets the next
                # free index, i.e. the current size of the map.
                classes[sample_id] = [
                    cats.setdefault(lbl, len(cats)) for lbl in categories[0]
                ]

        File_Util.save_json(no_cat_ids,
                            self.dataset_name + "_no_cat_ids",
                            filepath=self.dataset_dir)
        logger.info(
            "Number of txts: [{}], sample2cats: [{}] and cattext2catid_map: [{}]."
            .format(len(txts), len(classes), len(cats)))
        return txts, classes, cats
    def create_new_data(self,
                        new_data_name: str = "_pointer",
                        save_files: bool = True,
                        save_dir: str = None,
                        catid2cattxt_map: OrderedDict = None):
        """Creates new dataset based on new_data_name value.

        Supported values: "_fixed5", "_onehot", "_pointer", "_fewshot" and
        "_firstsent". Each one is produced by the matching
        ``_create_*_data`` method; "_fixed5" first builds the one-class data
        and then passes it to ``_create_fixed_cat_data``.

        If the three json files of the derived dataset already exist in
        ``save_dir`` they are loaded instead of being regenerated.

        NOTE: This method is used only for sanity testing using fixed multi-class scenario.

        :param new_data_name: Suffix selecting the kind of dataset to create.
        :param save_files: If true, newly generated data is saved to ``save_dir``.
        :param save_dir: Target directory, defaults to
            ``<dataset_dir>/<dataset_name><new_data_name>``.
        :param catid2cattxt_map: Category id to text map; loaded from
            ``self.dataset_dir`` when None and the data has to be generated.
        :return: txts, sample2cats and cats of the new dataset.
        :raises ValueError: If ``new_data_name`` is not a supported value.
        """
        prefix = self.dataset_name + new_data_name
        if save_dir is None:
            save_dir = join(self.dataset_dir, prefix)

        cached = all(
            isfile(join(save_dir, prefix + suffix + ".json"))
            for suffix in ("_sample2cats", "_txts", "_cats"))
        if cached:
            logger.info("Loading files from: [{}]".format(save_dir))
            txts_new = File_Util.load_json(prefix + "_txts",
                                           filepath=save_dir)
            sample2cats_new = File_Util.load_json(prefix + "_sample2cats",
                                                  filepath=save_dir)
            cats_new = File_Util.load_json(prefix + "_cats",
                                           filepath=save_dir)
            return txts_new, sample2cats_new, cats_new

        logger.info(
            "No existing files found at [{}]. Generating {} files.".format(
                save_dir, new_data_name))
        if catid2cattxt_map is None:
            catid2cattxt_map = File_Util.load_json(
                self.dataset_name + "_catid2cattxt_map",
                filepath=self.dataset_dir)

        txts, classes, _ = self.load_full_json(return_values=True)
        # Strings must be compared with "==": "is" tests object identity and
        # only matched by accident when both strings happened to be interned.
        if new_data_name == "_fixed5":
            txts_one, classes_one, _ = self._create_oneclass_data(
                txts, classes, catid2cattxt_map=catid2cattxt_map)
            txts_new, sample2cats_new, cats_new = self._create_fixed_cat_data(
                txts_one, classes_one, catid2cattxt_map=catid2cattxt_map)
        elif new_data_name == "_onehot":
            txts_new, sample2cats_new, cats_new = self._create_oneclass_data(
                txts, classes, catid2cattxt_map=catid2cattxt_map)
        elif new_data_name == "_pointer":
            txts_new, sample2cats_new, cats_new = self._create_pointer_data(
                txts, classes, catid2cattxt_map=catid2cattxt_map)
        elif new_data_name == "_fewshot":
            txts_new, sample2cats_new, cats_new = self._create_fewshot_data(
                txts, classes, catid2cattxt_map=catid2cattxt_map)
        elif new_data_name == "_firstsent":
            txts_new, sample2cats_new, cats_new = self._create_firstsent_data(
                txts, classes, catid2cattxt_map=catid2cattxt_map)
        else:
            raise ValueError(
                "Unknown 'new_data_name': [{}]. \n Available options: "
                "['_fixed5', '_onehot', '_pointer', '_fewshot', '_firstsent']"
                .format(new_data_name))

        if save_files:  # Storing new data
            logger.info(
                "New dataset will be stored inside original dataset directory at: [{}]"
                .format(save_dir))
            makedirs(save_dir, exist_ok=True)
            File_Util.save_json(txts_new,
                                prefix + "_txts",
                                filepath=save_dir)
            File_Util.save_json(sample2cats_new,
                                prefix + "_sample2cats",
                                filepath=save_dir)
            File_Util.save_json(cats_new,
                                prefix + "_cats",
                                filepath=save_dir)

        return txts_new, sample2cats_new, cats_new
    def load_full_json(self, return_values: bool = False):
        """
        Loads full dataset and splits the data into train, val and test.

        If the pre-processed "_txts", "_sample2cats" and "_cats" json files
        exist in ``self.dataset_dir`` they are loaded. Otherwise the raw data
        is loaded, categories are cleaned (duplicate categories are remapped
        in the classes) and the three files are written.

        :param return_values: If true, return the full ``txts, classes,
            categories`` without splitting. If false, split the data, store
            the splits on ``self``, save them as json files and return them.
        :return: ``(txts, classes, categories)`` when ``return_values`` is
            true, else ``(txts_train, sample2cats_train, cats_sel, txts_val,
            sample2cats_val, cats_val, txts_test, sample2cats_test,
            cats_test)``.
        """
        txts_file = join(self.dataset_dir, self.dataset_name + "_txts.json")
        classes_file = join(self.dataset_dir,
                            self.dataset_name + "_sample2cats.json")
        cats_file = join(self.dataset_dir, self.dataset_name + "_cats.json")

        if isfile(txts_file) and isfile(classes_file) and isfile(cats_file):
            logger.info("Loading pre-processed json files from: [{}]".format(
                txts_file))
            txts = File_Util.load_json(self.dataset_name + "_txts",
                                       filepath=self.dataset_dir,
                                       show_path=True)
            classes = File_Util.load_json(self.dataset_name + "_sample2cats",
                                          filepath=self.dataset_dir,
                                          show_path=True)
            categories = File_Util.load_json(self.dataset_name + "_cats",
                                             filepath=self.dataset_dir,
                                             show_path=True)
            assert len(txts) == len(classes),\
                "Count of txts [{0}] and sample2cats [{1}] should match.".format(
                    len(txts),len(classes))
        else:
            logger.warning("Pre-processed json files not found at: [{}]".format(
                txts_file))
            logger.info(
                "Loading raw data and creating 3 separate dicts of txts [id->texts], sample2cats [id->class_ids]"
                " and categories [class_name : class_id].")
            txts, classes, categories = self.load_raw_data(self.dataset_type)
            # First save of the raw dicts, before categories are cleaned; the
            # same files are overwritten with the cleaned data further down.
            File_Util.save_json(categories,
                                self.dataset_name + "_cats",
                                filepath=self.dataset_dir)
            File_Util.save_json(txts,
                                self.dataset_name + "_txts",
                                filepath=self.dataset_dir)
            File_Util.save_json(classes,
                                self.dataset_name + "_sample2cats",
                                filepath=self.dataset_dir)
            logger.info("Cleaning categories.")
            categories, categories_dup_dict, dup_cat_text_map = self.clean.clean_categories(
                categories)
            File_Util.save_json(dup_cat_text_map,
                                self.dataset_name + "_dup_cat_text_map",
                                filepath=self.dataset_dir,
                                overwrite=True)
            File_Util.save_json(categories,
                                self.dataset_name + "_cats",
                                filepath=self.dataset_dir,
                                overwrite=True)
            if categories_dup_dict:  # Replace old category ids with new ids if duplicate categories found.
                File_Util.save_json(
                    categories_dup_dict,
                    self.dataset_name + "_categories_dup_dict",
                    filepath=self.dataset_dir,
                    overwrite=True
                )  # Storing the duplicate categories for future dedup removal.
                classes = self.clean.dedup_data(classes, categories_dup_dict)
            assert len(txts) == len(classes),\
                "Count of txts [{0}] and sample2cats [{1}] should match.".format(
                    len(txts),len(classes))
            File_Util.save_json(txts,
                                self.dataset_name + "_txts",
                                filepath=self.dataset_dir,
                                overwrite=True)
            File_Util.save_json(classes,
                                self.dataset_name + "_sample2cats",
                                filepath=self.dataset_dir,
                                overwrite=True)
            # Report the paths that were actually written (directory joined
            # with "<dataset_name>_*.json").
            logger.info(
                "Saved txts [{0}], sample2cats [{1}] and categories [{2}] as json files."
                .format(txts_file, classes_file, cats_file))

        if return_values:
            return txts, classes, categories

        # Splitting data into train, validation and test sets.
        self.txts_train,self.sample2cats_train,self.cats_sel,self.txts_val,self.sample2cats_val,\
        self.cats_val,self.txts_test,self.sample2cats_test,self.cats_test,catid2cattxt_map =\
            self.split_data(txts=txts,classes=classes,categories=categories)
        txts, classes, categories = None, None, None  # Remove large dicts and free up memory.
        collect()

        # (data, file name suffix) pairs, saved in this order.
        split_outputs = (
            (self.txts_test, "_txts_test"),
            (self.sample2cats_test, "_sample2cats_test"),
            (self.txts_val, "_txts_val"),
            (self.sample2cats_val, "_sample2cats_val"),
            (self.txts_train, "_txts_train"),
            (self.sample2cats_train, "_sample2cats_train"),
            (self.cats_sel, "_cats_train"),
            (self.cats_val, "_cats_val"),
            (self.cats_test, "_cats_test"),
            (catid2cattxt_map, "_catid2cattxt_map"),
        )
        for data, suffix in split_outputs:
            File_Util.save_json(data,
                                self.dataset_name + suffix,
                                filepath=self.dataset_dir)
        return self.txts_train,self.sample2cats_train,self.cats_sel,self.txts_val,self.sample2cats_val,\
               self.cats_val,self.txts_test,self.sample2cats_test,self.cats_test