def save_vocabs_and_config(self, idx2label_path=None, idx2cls_path=None, config_path=None):
    """Persist the label vocabulary, the optional cls vocabulary, and the
    data config to JSON files.

    Parameters
    ----------
    idx2label_path : str or None, optional (default=None)
        Destination for the idx2label map; falls back to ``self.idx2label_path``.
    idx2cls_path : str or None, optional (default=None)
        Destination for the idx2cls map; falls back to ``self.idx2cls_path``.
    config_path : str or None, optional (default=None)
        Destination for the config; falls back to ``self.config_path``.
    """
    label_dst = if_none(idx2label_path, self.idx2label_path)
    cls_dst = if_none(idx2cls_path, self.idx2cls_path)
    cfg_dst = if_none(config_path, self.config_path)
    logging.info("Saving vocabs...")
    save_json(self.idx2label, label_dst)
    # The cls vocabulary only exists for the joint (is_cls) model.
    if self.idx2cls:
        save_json(self.idx2cls, cls_dst)
    save_json(self.get_config(), cfg_dst)
def load(self, df_path=None, df=None):
    """Load the data frame and the label/POS (and optionally cls) vocabularies.

    Sets ``self.df``, ``self.label2idx``/``self.idx2label``,
    ``self.pos2idx``/``self.idx2pos`` and, when ``self.config["is_cls"]``
    is true, ``self.cls2idx``/``self.idx2cls``.

    Parameters
    ----------
    df_path : str or None, optional (default=None)
        Path of a tab-separated data frame; falls back to
        ``self.config["df_path"]``.
    df : pandas.DataFrame or None, optional (default=None)
        Pre-loaded data frame. Bug fix: previously this argument was
        silently ignored; it is now assigned to ``self.df``.

    Returns
    -------
    None
    """
    def _read_vocab(path):
        # One item per line; line order defines the index.
        item2idx, idx2item = {}, []
        with open(path, "r", encoding="utf-8") as f:
            for idx, line in enumerate(f.readlines()):
                item = line.strip()
                item2idx[item] = idx
                idx2item.append(item)
        return item2idx, idx2item

    df_path = if_none(df_path, self.config["df_path"])
    if df is None:
        self.df = pd.read_csv(df_path, sep='\t')
    else:
        # Honor a caller-supplied frame instead of dropping it.
        self.df = df

    self.label2idx, self.idx2label = _read_vocab(self.config["idx2labels_path"])
    self.pos2idx, self.idx2pos = _read_vocab(self.config["idx2pos_path"])

    if self.config["is_cls"]:
        self.cls2idx, self.idx2cls = _read_vocab(self.config["idx2cls_path"])
def load(self, df_path=None, df=None, charidx_path=None):
    """Load the data frame, label vocab, optional cls vocab, and char vocab.

    NOTE(review): this redefines ``load`` and therefore shadows the other
    ``load`` definition in this file (the POS-vocab variant); confirm which
    one is intended to win.

    Sets ``self.df``, ``self.label2idx``/``self.idx2label``, optionally
    ``self.cls2idx``/``self.idx2cls``, and (Olunlah's addition)
    ``self.char2idx``/``self.idx2char``.

    Parameters
    ----------
    df_path : str or None, optional (default=None)
        Path of a tab-separated data frame; falls back to
        ``self.config["df_path"]``.
    df : pandas.DataFrame or None, optional (default=None)
        Pre-loaded data frame. Bug fix: previously this argument was
        silently ignored; it is now assigned to ``self.df``.
    charidx_path : str or None, optional (default=None)
        Path of the char2idx JSON file. Generalized from a hard-coded
        constant; the old Colab path remains the default for backward
        compatibility.

    Returns
    -------
    None
    """
    import json

    df_path = if_none(df_path, self.config["df_path"])
    if df is None:
        self.df = pd.read_csv(df_path, sep='\t')
    else:
        # Honor a caller-supplied frame instead of dropping it.
        self.df = df

    self.label2idx = {}
    self.idx2label = []
    with open(self.config["idx2labels_path"], "r", encoding="utf-8") as f:
        for idx, label in enumerate(f.readlines()):
            label = label.strip()
            self.label2idx[label] = idx
            self.idx2label.append(label)

    if self.config["is_cls"]:
        self.idx2cls = []
        self.cls2idx = {}
        with open(self.config["idx2cls_path"], "r", encoding="utf-8") as f:
            for idx, label in enumerate(f.readlines()):
                label = label.strip()
                self.cls2idx[label] = idx
                self.idx2cls.append(label)

    # ----Olunlah----
    # ------- Char2idx Model ------------
    # TODO(review): the default below is an environment-specific Colab path;
    # prefer passing charidx_path explicitly or moving it into self.config.
    charidx_path = if_none(
        charidx_path,
        '/content/drive/My Drive/Colab Notebooks/IS_NER/data/03_BERT_Thai_NER/in/LST20/char2idx.txt')
    with open(charidx_path) as json_file:
        char2idx = json.load(json_file)
    self.char2idx = char2idx
    # Invert the mapping (index -> character); the original enumerate-based
    # comprehension ignored its counter, so a plain items() inversion suffices.
    self.idx2char = {i: c for c, i in char2idx.items()}
def save(self, df_path=None):
    """Write ``self.df`` to a tab-separated file without the index column.

    Parameters
    ----------
    df_path : str or None, optional (default=None)
        Destination path; falls back to ``self.config["df_path"]``.
    """
    target = if_none(df_path, self.config["df_path"])
    self.df.to_csv(target, sep='\t', index=False)
def create(cls,
           bert_vocab_file,
           config_path=None,
           train_path=None,
           valid_path=None,
           idx2label=None,
           bert_model_type="bert_cased",
           idx2cls=None,
           max_seq_len=424,
           batch_size=16,
           is_cls=False,
           idx2label_path=None,
           idx2cls_path=None,
           pad="<pad>",
           device="cuda:0",
           clear_cache=True,
           data_columns=None,
           shuffle=True,
           dir_config=None,
           prc_text=preprocess_text):
    """Create or skip data loaders, load or create vocabs.

    DataFrame should have 2 or 3 columns. Structure: see the
    ``data_columns`` description.

    Parameters
    ----------
    bert_vocab_file : str
        Path of vocabulary for BERT tokenizer.
    config_path : str or None, optional (default=None)
        Path of config of BertNerData.
    train_path : str or None, optional (default=None)
        Path of train data frame. If not None update idx2label,
        idx2cls, idx2meta.
    valid_path : str or None, optional (default=None)
        Path of valid data frame. If not None update idx2label,
        idx2cls, idx2meta.
    idx2label : list or None, optional (default=None)
        Map from index to label.
    bert_model_type : str, optional (default="bert_cased")
        Mode of BERT model (CASED or UNCASED).
    idx2cls : list or None, optional (default=None)
        Map from index to cls.
    max_seq_len : int, optional (default=424)
        Max sequence length.
    batch_size : int, optional (default=16)
        Batch size.
    is_cls : bool, optional (default=False)
        Use joint model or single.
    idx2label_path : str or None, optional (default=None)
        Path to idx2label map. If not None and idx2label is None
        load idx2label.
    idx2cls_path : str or None, optional (default=None)
        Path to idx2cls map. If not None and idx2cls is None load idx2cls.
    pad : str, optional (default="<pad>")
        Padding token.
    device : str, optional (default="cuda:0")
        Run model on gpu or cpu. If "cpu" don't pin tensors in data
        loaders to gpu. Notation similar as torch.cuda.device.
    clear_cache : bool, optional (default=True)
        If True, rewrite all vocabs and BertNerData config.
    data_columns : list[str] or None, optional (default=None)
        Columns of pandas.DataFrame; defaults to ["0", "1", "2"].
        data_columns[0] - labels column, each label joined by space;
        data_columns[1] - tokens column, input sequence tokenized and
        joined by space;
        data_columns[2] - cls column (if is_cls is not None).
    shuffle : bool, optional (default=True)
        Is shuffle data.
    dir_config : str or None, optional (default=None)
        Dir for storing vocabs if paths are not set.
    prc_text : callable, optional (default=preprocess_text)
        Function for preprocessing text. By default removes some bad
        unicode words (full-match only, does not look inside words).

    Returns
    ----------
    data : BertNerData
        Created object of BertNerData.

    Raises
    ------
    ValueError
        If neither idx2label nor a way to locate idx2label_path is given.
    NotImplementedError
        If bert_model_type is not "bert_cased" or "bert_uncased".
    """
    # Fix: the former default ``["0", "1", "2"]`` was a mutable default
    # argument shared across calls; materialize it per call instead.
    if data_columns is None:
        data_columns = ["0", "1", "2"]
    idx2label_path = if_none(
        idx2label_path,
        os.path.join(dir_config, "idx2label.json")
        if dir_config is not None else None)
    if idx2label is None and idx2label_path is None:
        raise ValueError("Must set idx2label_path.")
    if bert_model_type == "bert_cased":
        do_lower_case = False
    elif bert_model_type == "bert_uncased":
        do_lower_case = True
    else:
        raise NotImplementedError("No requested mode :(.")
    tokenizer = tokenization.FullTokenizer(vocab_file=bert_vocab_file,
                                           do_lower_case=do_lower_case)
    # Reuse a cached label vocab only when the caller did not pass one
    # and the cache is not being cleared.
    if idx2label is None and os.path.exists(
            str(idx2label_path)) and not clear_cache:
        idx2label = read_json(idx2label_path)
    if is_cls:
        idx2cls_path = if_none(
            idx2cls_path,
            os.path.join(dir_config, "idx2cls.json")
            if dir_config is not None else None)
    if is_cls and idx2cls is None and os.path.exists(
            str(idx2cls_path)) and not clear_cache:
        idx2cls = read_json(idx2cls_path)
    config_path = if_none(
        config_path,
        os.path.join(dir_config, "data_ner.json")
        if dir_config is not None else None)
    data = cls(bert_vocab_file=bert_vocab_file,
               train_path=train_path,
               valid_path=valid_path,
               idx2label=idx2label,
               config_path=config_path,
               tokenizer=tokenizer,
               bert_model_type=bert_model_type,
               idx2cls=idx2cls,
               max_seq_len=max_seq_len,
               batch_size=batch_size,
               is_cls=is_cls,
               idx2label_path=idx2label_path,
               idx2cls_path=idx2cls_path,
               pad=pad,
               device=device,
               data_columns=data_columns,
               shuffle=shuffle,
               prc_text=prc_text)
    if train_path is not None:
        _ = data.load_train_dl(train_path)
    if valid_path is not None:
        _ = data.load_valid_dl(valid_path)
    # Persist freshly built vocabs/config when the cache was cleared.
    if clear_cache:
        data.save_vocabs_and_config()
    return data