Example #1
    def save_vocabs_and_config(self,
                               idx2label_path=None,
                               idx2cls_path=None,
                               config_path=None):
        idx2label_path = if_none(idx2label_path, self.idx2label_path)
        idx2cls_path = if_none(idx2cls_path, self.idx2cls_path)
        config_path = if_none(config_path, self.config_path)
        logging.info("Saving vocabs...")
        save_json(self.idx2label, idx2label_path)
        if self.idx2cls:
            save_json(self.idx2cls, idx2cls_path)
        save_json(self.get_config(), config_path)
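The snippet above relies on two helpers, if_none and save_json, that are not shown in this excerpt. A minimal sketch of what they presumably do, inferred only from how they are called here:

import json

def if_none(origin, other):
    # assumed behaviour: fall back to the default when the first value is None
    return other if origin is None else origin

def save_json(obj, path):
    # assumed behaviour: dump the object as JSON to the given path
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False)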
Example #2
    def load(self, df_path=None, df=None):
        df_path = if_none(df_path, self.config["df_path"])
        if df is None:
            self.df = pd.read_csv(df_path, sep='\t')
        else:
            # use the DataFrame passed in instead of silently ignoring it
            self.df = df
        self.label2idx = {}
        self.idx2label = []
        with open(self.config["idx2labels_path"], "r", encoding="utf-8") as f:
            for idx, label in enumerate(f.readlines()):
                label = label.strip()
                self.label2idx[label] = idx
                self.idx2label.append(label)

        self.pos2idx = {}
        self.idx2pos = []
        with open(self.config["idx2pos_path"], "r", encoding="utf-8") as f:
            for idx, posl in enumerate(f.readlines()):
                pos = posl.strip()
                self.pos2idx[pos] = idx
                self.idx2pos.append(pos)


        if self.config["is_cls"]:
            self.idx2cls = []
            self.cls2idx = {}
            with open(self.config["idx2cls_path"], "r", encoding="utf-8") as f:
                for idx, label in enumerate(f.readlines()):
                    label = label.strip()
                    self.cls2idx[label] = idx
                    self.idx2cls.append(label)
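The same one-item-per-line vocab-reading pattern appears three times above (labels, POS tags, classes). A hypothetical standalone helper that captures it, shown here only for illustration and not part of the original class:

def read_vocab(path):
    # one item per line; the index is the line number
    item2idx, idx2item = {}, []
    with open(path, "r", encoding="utf-8") as f:
        for idx, line in enumerate(f):
            item = line.strip()
            item2idx[item] = idx
            idx2item.append(item)
    return item2idx, idx2item

# e.g. self.label2idx, self.idx2label = read_vocab(self.config["idx2labels_path"])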
Example #3
    def load(self, df_path=None, df=None):
        '''
        # Objective
            - load label2idx and idx2label from self.config["idx2labels_path"]
            - load the **data** and store it in the instance 'df' attribute
                (df is a normal pandas DataFrame)
            - Olunlah: additionally load char2idx and idx2char

        # Parameter
            - df_path (optional): path to the tab-separated data frame;
              defaults to self.config["df_path"]
            - df (optional): an already loaded DataFrame; if given, the file is not read

        # Return
            - None; label2idx and idx2label are set as instance attributes
        '''
        df_path = if_none(df_path, self.config["df_path"])
        if df is None:
            self.df = pd.read_csv(df_path, sep='\t')
        else:
            # use the DataFrame passed in instead of silently ignoring it
            self.df = df
        self.label2idx = {}
        self.idx2label = []
        with open(self.config["idx2labels_path"], "r", encoding="utf-8") as f:
            for idx, label in enumerate(f.readlines()):
                label = label.strip()
                self.label2idx[label] = idx
                self.idx2label.append(label)

        if self.config["is_cls"]:
            self.idx2cls = []
            self.cls2idx = {}
            with open(self.config["idx2cls_path"], "r", encoding="utf-8") as f:
                for idx, label in enumerate(f.readlines()):
                    label = label.strip()
                    self.cls2idx[label] = idx
                    self.idx2cls.append(label)
        #----Olunlah----
        #------- Char2idx Model ------------
        import json
        #CHARIDX_PATH = '/content/drive/My Drive/Colab Notebooks/IS_NER/data/model/Supanut/charidx/char2idx.txt'
        CHARIDX_PATH = '/content/drive/My Drive/Colab Notebooks/IS_NER/data/03_BERT_Thai_NER/in/LST20/char2idx.txt'

        with open(CHARIDX_PATH) as json_file:
            char2idx = json.load(json_file)
        idx2char = {idx: ch for ch, idx in char2idx.items()}
        self.char2idx = char2idx
        self.idx2char = idx2char
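A hedged usage sketch of the loaded maps. The actual contents of char2idx.txt and its out-of-vocabulary convention are not shown above, so the fallback index and these function names are assumptions:

def encode_chars(token, char2idx, unk_idx=0):
    # map each character of a token to its index; unk_idx is a hypothetical
    # fallback for characters missing from char2idx
    return [char2idx.get(ch, unk_idx) for ch in token]

def decode_chars(indices, idx2char, unk="?"):
    # inverse mapping using the idx2char built above
    return "".join(idx2char.get(i, unk) for i in indices)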
Example #4
    def save(self, df_path=None):
        df_path = if_none(df_path, self.config["df_path"])
        self.df.to_csv(df_path, sep='\t', index=False)
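save() is the counterpart of load() in Example #2: both use tab-separated files. A small round-trip sketch; the file name and column contents are placeholders:

import pandas as pd

df = pd.DataFrame({"0": ["B-PER O B-LOC"], "1": ["John visited Springfield"]})
df.to_csv("sample.tsv", sep="\t", index=False)          # what save() does
assert pd.read_csv("sample.tsv", sep="\t").equals(df)   # what load() reads back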
Example #5
    def create(cls,
               bert_vocab_file,
               config_path=None,
               train_path=None,
               valid_path=None,
               idx2label=None,
               bert_model_type="bert_cased",
               idx2cls=None,
               max_seq_len=424,
               batch_size=16,
               is_cls=False,
               idx2label_path=None,
               idx2cls_path=None,
               pad="<pad>",
               device="cuda:0",
               clear_cache=True,
               data_columns=["0", "1", "2"],
               shuffle=True,
               dir_config=None,
               prc_text=preprocess_text):
        """
        Create or skip data loaders, load or create vocabs.
        The DataFrame should have 2 or 3 columns; see the data_columns description for the structure.

        Parameters
        ----------
        bert_vocab_file : str
            Path of vocabulary for BERT tokenizer.
        config_path : str, or None, optional (default=None)
            Path of config of BertNerData.
        train_path : str or None, optional (default=None)
            Path of the train data frame. If not None, idx2label, idx2cls and idx2meta are updated.
        valid_path : str or None, optional (default=None)
            Path of the valid data frame. If not None, idx2label, idx2cls and idx2meta are updated.
        idx2label : list or None, optional (default=None)
            Map from index to label.
        bert_model_type : str, optional (default="bert_cased")
            Mode of BERT model (CASED or UNCASED).
        idx2cls : list or None, optional (default=None)
            Map from index to cls.
        max_seq_len : int, optional (default=424)
            Max sequence length.
        batch_size : int, optional (default=16)
            Batch size.
        is_cls : bool, optional (default=False)
            Whether to use the joint (NER + cls) model or the single one.
        idx2label_path : str or None, optional (default=None)
            Path to the idx2label map. If not None and idx2label is None, idx2label is loaded from this path.
        idx2cls_path : str or None, optional (default=None)
            Path to the idx2cls map. If not None and idx2cls is None, idx2cls is loaded from this path.
        pad : str, optional (default="<pad>")
            Padding token.
        device : str, optional (default="cuda:0")
            Device to run the model on (GPU or CPU). If "cpu", tensors in the data loaders are not pinned to the GPU.
            The notation is the same as for torch.cuda.device.
        clear_cache : bool, optional (default=True)
            If True, rewrite all vocabs and BertNerData config.
        data_columns : list[str], optional (default=["0", "1", "2"])
            Columns of the pandas.DataFrame:
                data_columns[0] - the labels column; labels should be joined by spaces;
                data_columns[1] - the tokens column; the input sequence should be tokenized and joined by spaces;
                data_columns[2] - the cls column (used only if is_cls is True).
        shuffle : bool, optional (default=True)
            Whether to shuffle the data.
        dir_config : str or None, optional (default=None)
            Directory for storing vocabs and the config if the individual paths are not set.
        prc_text : callable, optional (default=preprocess_text)
            Function for preprocessing text. By default it removes some bad unicode symbols.
            Note: it does not look inside words; a word is removed only if it fully matches a bad symbol.

        Returns
        ----------
        data : BertNerData
            Created object of BertNerData.
        """
        idx2label_path = if_none(
            idx2label_path,
            os.path.join(dir_config, "idx2label.json")
            if dir_config is not None else None)

        if idx2label is None and idx2label_path is None:
            raise ValueError("Must set idx2label_path.")

        if bert_model_type == "bert_cased":
            do_lower_case = False
        elif bert_model_type == "bert_uncased":
            do_lower_case = True
        else:
            raise NotImplementedError("No requested mode :(.")

        tokenizer = tokenization.FullTokenizer(vocab_file=bert_vocab_file,
                                               do_lower_case=do_lower_case)

        if idx2label is None and os.path.exists(
                str(idx2label_path)) and not clear_cache:
            idx2label = read_json(idx2label_path)
        if is_cls:
            idx2cls_path = if_none(
                idx2cls_path,
                os.path.join(dir_config, "idx2cls.json")
                if dir_config is not None else None)
        if is_cls and idx2cls is None and os.path.exists(
                str(idx2cls_path)) and not clear_cache:
            idx2cls = read_json(idx2cls_path)

        config_path = if_none(
            config_path,
            os.path.join(dir_config, "data_ner.json")
            if dir_config is not None else None)

        data = cls(bert_vocab_file=bert_vocab_file,
                   train_path=train_path,
                   valid_path=valid_path,
                   idx2label=idx2label,
                   config_path=config_path,
                   tokenizer=tokenizer,
                   bert_model_type=bert_model_type,
                   idx2cls=idx2cls,
                   max_seq_len=max_seq_len,
                   batch_size=batch_size,
                   is_cls=is_cls,
                   idx2label_path=idx2label_path,
                   idx2cls_path=idx2cls_path,
                   pad=pad,
                   device=device,
                   data_columns=data_columns,
                   shuffle=shuffle,
                   prc_text=prc_text)

        if train_path is not None:
            _ = data.load_train_dl(train_path)

        if valid_path is not None:
            _ = data.load_valid_dl(valid_path)

        if clear_cache:
            data.save_vocabs_and_config()
        return data
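A hypothetical call of this classmethod, based on the docstring above; all paths are placeholders and the import location of BertNerData is not shown in this excerpt:

data = BertNerData.create(
    bert_vocab_file="bert/vocab.txt",   # placeholder paths
    train_path="data/train.tsv",
    valid_path="data/valid.tsv",
    dir_config="data/conf",             # idx2label.json and data_ner.json are written here
    bert_model_type="bert_cased",
    is_cls=False,
    clear_cache=True)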