Example #1
    @classmethod
    def create(cls,
               idx2labels_path,
               idx2pos_path,
               df_path=None,
               idx2labels=None,
               idx2cls=None,
               idx2cls_path=None,
               idx2pos=None,
               min_char_len=1,
               model_name="bert-base-multilingual-cased",
               max_sequence_length=424,
               pad_idx=0,
               clear_cache=False,
               is_cls=False,
               markup="IO",
               df=None, tokenizer=None):
        # 1.) Check tokenizer: fall back to the Thai BPE tokenizer if none is given
        if tokenizer is None:
            # tokenizer = BertTokenizer.from_pretrained(model_name)
            ### modified by prapas: replace the BERT tokenizer with a Thai tokenizer
            bpe_vocab_path = '/content/drive/My Drive/Colab Notebooks/IS_NER/data/03_BERT_Thai_NER/model/th_wiki_bpe/th.wiki.bpe.op25000.vocab'
            bpe_model_path = '/content/drive/My Drive/Colab Notebooks/IS_NER/data/03_BERT_Thai_NER/model/th_wiki_bpe/th.wiki.bpe.op25000.model'
            tokenizer = ThaiTokenization.ThaiTokenizer(vocab_file=bpe_vocab_path, spm_file=bpe_model_path)
            ### end - modified by prapas
        # --- Set config ---
        config = {
            "min_char_len": min_char_len,
            "model_name": model_name,
            "max_sequence_length": max_sequence_length,
            "clear_cache": clear_cache,
            "df_path": df_path,
            "pad_idx": pad_idx,
            "is_cls": is_cls,
            "idx2labels_path": idx2labels_path,
            "idx2cls_path": idx2cls_path,
            "idx2pos_path": idx2pos_path,
            "markup": markup
        }
        # 2.) Read the data with read_csv, or build an empty DataFrame
        if df is None and df_path is not None:
            df = pd.read_csv(df_path, sep='\t')
        elif df is None:
            if is_cls:
                df = pd.DataFrame(columns=["labels", "text", "pos", "clf"])
            else:
                df = pd.DataFrame(columns=["labels", "text", "pos"])
        if clear_cache:
            _ = cls.create_vocabs(
                df, tokenizer, idx2labels_path, idx2pos_path, markup, idx2cls_path, pad_idx, is_cls, idx2labels, idx2cls, idx2pos)
        self = cls(tokenizer, df=df, config=config, is_cls=is_cls)
        # 3.) Call load to build idx2label / label2idx
        self.load(df=df)  # <== needs additional code
        return self
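
A minimal usage sketch for this variant follows. The class name TextDataSet is taken from the docstring in Example #2, and every file path below is a hypothetical placeholder; note also that create() assumes pandas (pd) and the project's ThaiTokenization module are imported in the defining module.

# Hedged usage sketch: TextDataSet and all paths are assumptions, not confirmed by the snippet above.
dataset = TextDataSet.create(
    idx2labels_path="vocab/idx2labels.txt",  # hypothetical label-vocab file
    idx2pos_path="vocab/idx2pos.txt",        # hypothetical POS-vocab file
    df_path="data/train.tsv",                # tab-separated file with labels / text / pos columns
    clear_cache=True,                        # rebuild the vocab files via cls.create_vocabs
)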
Example #2
    @classmethod
    def create(cls,
               idx2labels_path,
               df_path=None,
               idx2labels=None,
               idx2cls=None,
               idx2cls_path=None,
               min_char_len=1,
               model_name="bert-base-multilingual-cased",
               max_sequence_length=424,
               pad_idx=0,
               clear_cache=False,
               is_cls=False,
               markup="IO",
               df=None, tokenizer=None,
               max_char_length=30
               ):
        '''
        # Objective:
            - Factory method to be called when creating a TextDataSet instance.
            - It assigns "df" (storing the data as an instance attribute at creation time).
            - It behaves much like __init__; it is unclear why __init__ was not used directly.

        # Args:
            - many (see the signature above)

        # Return:
            - the instance itself
        '''
        # 1.) Check Tokenizer
        if tokenizer is None:
            # tokenizer = BertTokenizer.from_pretrained(model_name)
            ### modified by prapas: replace the BERT tokenizer with a Thai tokenizer
            bpe_vocab_path = '/content/drive/My Drive/Colab Notebooks/IS_NER/data/03_BERT_Thai_NER/model/th_wiki_bpe/th.wiki.bpe.op25000.vocab'
            bpe_model_path = '/content/drive/My Drive/Colab Notebooks/IS_NER/data/03_BERT_Thai_NER/model/th_wiki_bpe/th.wiki.bpe.op25000.model'
            tokenizer = ThaiTokenization.ThaiTokenizer(vocab_file=bpe_vocab_path, spm_file=bpe_model_path)
            ### end - modified by prapas
        # --- Set config ---
        config = {
            "min_char_len": min_char_len,
            "model_name": model_name,
            "max_sequence_length": max_sequence_length, # max words in a sentence
            "clear_cache": clear_cache,
            "df_path": df_path,
            "pad_idx": pad_idx,
            "is_cls": is_cls,
            "idx2labels_path": idx2labels_path,
            "idx2cls_path": idx2cls_path,
            "markup": markup,
            "max_char_length": max_char_length
        }
        # 2.) Read the data using read_csv --> store in df
        if df is None and df_path is not None:
            df = pd.read_csv(df_path, sep='\t')
        elif df is None:
            if is_cls:
                df = pd.DataFrame(columns=["labels", "text", "clf"])
            else:
                df = pd.DataFrame(columns=["labels", "text"])
        if clear_cache:
            _ = cls.create_vocabs(
                df, tokenizer, idx2labels_path, markup, idx2cls_path, pad_idx, is_cls, idx2labels, idx2cls)
        self = cls(tokenizer, df=df, config=config, is_cls=is_cls)

        # 3.) Call load to build idx2label / label2idx
        self.load(df=df)
        return self
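
The same kind of hedged sketch for this variant: the POS arguments are gone and max_char_length is carried into the config. TextDataSet and the paths are again assumptions.

# Hedged usage sketch: TextDataSet and all paths are assumptions.
dataset = TextDataSet.create(
    idx2labels_path="vocab/idx2labels.txt",  # hypothetical label-vocab file
    df_path="data/train.tsv",                # tab-separated file with labels / text columns
    max_char_length=30,                      # per-token character cap stored in config
    clear_cache=True,                        # rebuild the label vocab via cls.create_vocabs
)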