def fine_tune(pos_action, neg_action, tokenizer, model):
    nlg_usr = TemplateNLG(is_user=True)
    nlg_sys = TemplateNLG(is_user=False)
    pos_train_usr_utter = []
    pos_train_sys_utter = []
    neg_train_usr_utter = []
    neg_train_sys_utter = []

    # Turn positive and negative dialog-act pairs into natural-language utterances
    for turn in pos_action:
        if turn[0] != [] and turn[1] != []:
            s_u = nlg_usr.generate(turn[0])
            s_a = nlg_sys.generate(turn[1])
            pos_train_usr_utter.append(s_u)
            pos_train_sys_utter.append(s_a)
    for turn in neg_action:
        if turn[0] != [] and turn[1] != []:
            s_u = nlg_usr.generate(turn[0])
            s_a = nlg_sys.generate(turn[1])
            neg_train_usr_utter.append(s_u)
            neg_train_sys_utter.append(s_a)

    # Positive examples are labeled 1, negative examples 0
    train_usr_utter = pos_train_usr_utter + neg_train_usr_utter
    train_sys_utter = pos_train_sys_utter + neg_train_sys_utter
    train_encoding = tokenizer(train_usr_utter, train_sys_utter,
                               padding=True, truncation=True, max_length=80)
    train_encoding['label'] = [1] * len(pos_train_usr_utter) + [0] * len(neg_train_usr_utter)
    train_dataset = Dataset.from_dict(train_encoding)
    train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

    save_dir = os.path.join(root_dir, 'convlab2/policy/dqn/NLE/save/script_fine_tune')
    log_dir = os.path.join(root_dir, 'convlab2/policy/dqn/NLE/save/script_fine_tune/logs')
    training_args = TrainingArguments(
        output_dir=save_dir,
        num_train_epochs=2,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=128,
        warmup_steps=500,
        weight_decay=0.01,
        evaluate_during_training=False,
        logging_dir=log_dir,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )
    trainer.train()
    trainer.save_model(os.path.join(save_dir, 'fine_tune_checkpoint'))
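# --- Hypothetical usage sketch (not part of the original source) ---
# Assumes pos_action / neg_action are lists of (user_dialog_acts, system_dialog_acts)
# pairs in ConvLab-2's [intent, domain, slot, value] format, and that the pair
# classifier being fine-tuned is a RoBERTa sequence-classification model. The toy
# dialog acts below are illustrative only.
from transformers import RobertaTokenizer, RobertaForSequenceClassification

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

pos_action = [([['Inform', 'Hotel', 'Area', 'east']], [['Request', 'Hotel', 'Price', '?']])]
neg_action = [([['Inform', 'Hotel', 'Area', 'east']], [['Inform', 'Train', 'Depart', 'cambridge']])]

fine_tune(pos_action, neg_action, tokenizer, model)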
def __call__(self, examples):
    # Tokenize the raw texts in batches and expose them as torch tensors
    ds = Dataset.from_dict({'text': examples})
    ds = ds.map(lambda batch: self.tokenizer(batch['text'], truncation=True, padding='max_length'),
                batched=True, batch_size=512)
    ds.set_format('torch', columns=['input_ids', 'token_type_ids', 'attention_mask'])

    # Run the model batch by batch and collect softmax scores on the CPU
    dataloader = torch.utils.data.DataLoader(ds, batch_size=16)
    res = []
    for batch in tqdm(dataloader):
        batch = {k: v.to(self.device) for k, v in batch.items()}
        outputs = self.model(**batch)
        res.append(outputs[0].softmax(1).detach().cpu())
    return torch.cat(res, dim=0).numpy()
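# --- Hypothetical wrapper sketch (not part of the original source) ---
# One way the __call__ above could be hosted: a small scorer class holding the
# tokenizer, model, and device it reads from self. The class name, constructor
# arguments, and the choice of 'bert-base-uncased' are assumptions; the original
# __call__ expects a tokenizer that returns token_type_ids.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


class SequenceScorer:
    def __init__(self, model_name='bert-base-uncased', device='cpu'):
        self.device = torch.device(device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(self.device)
        self.model.eval()

    # __call__ as defined above would be attached here, returning one softmax row per input text.


# scorer = SequenceScorer()
# probs = scorer(['i need a cheap hotel in the east part of town'])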
def shuffle(dataset: nlp.Dataset):
    _ = dataset.shuffle()
def sort(dataset: nlp.Dataset):
    _ = dataset.sort("numbers")
def select(dataset: nlp.Dataset):
    _ = dataset.select(range(0, len(dataset), 2))
train_usr_utter, train_sys_utter = generate_data(multiwoz_train)
val_usr_utter, val_sys_utter = generate_data(multiwoz_val)
test_usr_utter, test_sys_utter = generate_data(multiwoz_test)

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained('roberta-base')

train_encoding = tokenizer(train_usr_utter, train_sys_utter, padding=True, truncation=True, max_length=80)
train_encoding['label'] = [1] * (len(train_usr_utter) // 2) + [0] * (len(train_usr_utter) // 2)
train_dataset = Dataset.from_dict(train_encoding)
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

val_encoding = tokenizer(val_usr_utter, val_sys_utter, padding=True, truncation=True, max_length=80)
val_encoding['label'] = [1] * (len(val_usr_utter) // 2) + [0] * (len(val_usr_utter) // 2)
val_dataset = Dataset.from_dict(val_encoding)
val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

test_encoding = tokenizer(test_usr_utter, test_sys_utter, padding=True, truncation=True, max_length=80)
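# --- Hypothetical continuation sketch (not part of the original source) ---
# One plausible way to consume the train_dataset / val_dataset built above with the
# transformers Trainer; the output directory and hyperparameters are assumptions.
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./roberta_pair_classifier',
    num_train_epochs=2,
    per_device_train_batch_size=32,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()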
def filter(dataset: nlp.Dataset, **kwargs):
    _ = dataset.filter(**kwargs)
def map(dataset: nlp.Dataset, **kwargs):
    _ = dataset.map(**kwargs)
"SentiNews_sentence_test.tsv", sep="\t") df_document_sl_hr_train = pd.read_csv(sentinews_location + "HRSLSentiNews_document_train.tsv", sep="\t") df_document_sl_hr_valid = pd.read_csv(sentinews_location + "HRSLSentiNews_document_valid.tsv", sep="\t") # NO test hr mixed as HR test will be used as final test # gather everyone if you want to have a single DatasetDict document = DatasetDict({ "train": Dataset.from_pandas(df_document_sl_hr_train), "valid": Dataset.from_pandas(df_document_sl_hr_valid), "test": Dataset.from_pandas(df_document_croatian_test) }) # document.save_to_disk("sentinews-document") # gather everyone if you want to have a single DatasetDict paragraph = DatasetDict({ "train": Dataset.from_pandas(df_paragraph_train), "valid": Dataset.from_pandas(df_paragraph_valid), "test": Dataset.from_pandas(df_paragraph_test), }) # paragraph.save_to_disk("sentinews-paragraph") # gather everyone if you want to have a single DatasetDict sentence = DatasetDict({
                one_token, one_tag = line_split
                # strip the trailing newline
                one_tag = one_tag.rstrip('\n')
                sentence.append(one_token)
                sentence_tag.append(one_tag)
            else:
                print(f"Problem with this line {line}: it does not have 2 fields")
        else:
            # An empty line means the next sentence begins: append sentence and
            # sentence_tag to the overall tokens and ner_tags, then reset both.
            if sentence and sentence_tag:
                tokens.append(sentence)
                ner_tags.append(sentence_tag)
                sentence = []
                sentence_tag = []
    if len(tokens) != len(ner_tags):
        print('The lengths of tokens and ner_tags are not equal; the input file is malformed, please check it')
        result = {'tokens': [], 'ner_tags': []}
    else:
        result = {'tokens': tokens, 'ner_tags': ner_tags}
    return result


if __name__ == '__main__':
    train_file = "msra/msra_train_bio.txt"
    test_file = "msra/msra_test_bio.txt"
    mini_file = "msra/mini.txt"
    # test_dict = read_ner_txt(test_file)
    # dataset = Dataset.from_dict(test_dict)
    mini_dict = read_ner_txt(mini_file)
    dataset = Dataset.from_dict(mini_dict)
    print(dataset)
            line_split = line.split('\t')
            if len(line_split) == 2:
                one_token, one_tag = line_split
                # strip the trailing newline
                one_tag = one_tag.rstrip('\n')
                sentence.append(one_token)
                sentence_tag.append(one_tag)
            else:
                print(f"Problem with this line {line}: it does not have 2 fields")
        else:
            # An empty line means the next sentence begins: append sentence and
            # sentence_tag to the overall tokens and ner_tags, then reset both.
            if sentence and sentence_tag:
                tokens.append(sentence)
                ner_tags.append(sentence_tag)
                sentence = []
                sentence_tag = []
    if len(tokens) != len(ner_tags):
        print('The lengths of tokens and ner_tags are not equal; the input file is malformed, please check it')
        result = {'tokens': [], 'ner_tags': []}
    else:
        result = {'tokens': tokens, 'ner_tags': ner_tags}
    return result


if __name__ == '__main__':
    dev_file = "dataset/cosmetics/dev.txt"
    train_file = "dataset/cosmetics/train.txt"
    test_file = "dataset/cosmetics/test.txt"
    test_dict = read_ner_txt(test_file)
    dataset = Dataset.from_dict(test_dict)
    print(dataset)
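# --- Hypothetical follow-up sketch (not part of the original source) ---
# Shows one way to turn the string tags returned by read_ner_txt into integer ids
# before building the Dataset; the tag inventory below is an assumed BIO tag set,
# not the actual label set of the cosmetics corpus.
ASSUMED_TAGS = ['O', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']
tag2id = {tag: i for i, tag in enumerate(ASSUMED_TAGS)}

raw = read_ner_txt(test_file)
raw['ner_tags'] = [[tag2id[tag] for tag in tags] for tags in raw['ner_tags']]
encoded_dataset = Dataset.from_dict(raw)
print(encoded_dataset)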
def read_formatted_batch(dataset: nlp.Dataset, length, batch_size, type):
    with dataset.formatted_as(type=type):
        for i in range(0, length, batch_size):
            _ = dataset[i:i + batch_size]
def read_formatted(dataset: nlp.Dataset, length, type):
    with dataset.formatted_as(type=type):
        for i in range(length):
            _ = dataset[i]
def __init__(self, Xdata=None, Ydata=None, csv=None, xlsx=None, x_col='X', y_col='Y',
             models='all', test_frac=0.1, train_frac=0.9):
    if models == 'all':
        self.model_list = [
            'bert-base-uncased',
            'albert-base-v2',
            'roberta-base',
            'linear_SVM',
            'multinomial_naive_bayesian',
        ]
    elif models == 'count-vectorizer':
        self.model_list = [
            'linear_SVM',
            'multinomial_naive_bayesian',
        ]
    elif models == 'transformers':
        self.model_list = [
            'bert-base-uncased',
            'albert-base-v2',
            'roberta-base',
        ]
    else:
        print('Models not recognized, the available options are currently "all", "count-vectorizer", and "transformers"')
        return

    if csv is not None and xlsx is not None and Xdata is not None:
        print("You have provided too much data, give just x and y data, or a csv or xlsx file!")
        return
    if csv is not None:
        csv_data = pd.read_csv(csv)
        Xdata = csv_data[x_col]
        Ydata = csv_data[y_col]
    if xlsx is not None:
        xlsx_data = pd.read_excel(xlsx)
        Xdata = xlsx_data[x_col]
        Ydata = xlsx_data[y_col]
    if isinstance(Xdata, pd.Series):
        print('converting pandas series to list')
        Xdata = list(Xdata)
    if isinstance(Ydata, pd.Series):
        print('converting pandas series to list')
        Ydata = list(Ydata)

    if Xdata is None or Ydata is None:
        print('Either you have not put in your own data, or you have only put in X or Y data, loading default dataset...')
        self.train_dataset_raw, self.test_dataset_raw = load_dataset('imdb', split=['train', 'test'])
        X = self.train_dataset_raw['text'] + self.test_dataset_raw['text']
        Y = self.train_dataset_raw['label'] + self.test_dataset_raw['label']
        keys = set(Y)
    else:
        X = Xdata
        Y = Ydata
        if all(isinstance(n, int) for n in Y):
            keys = set(Y)
        else:
            Y, keys = string_labels_to_int(Y)
        # add method to make min label 0
        if min(Y) >= 1:
            Y = [y - min(Y) for y in Y]
        if len(Xdata) < 20:
            print('dataset is really small, using default test/train split (0.25)')
            test_frac = None
            train_frac = None
        if len(Xdata) < 8:
            print('dataset is really too small, using default test/train split (0.5)')
            test_frac = 0.5
            train_frac = 0.5
        if len(Xdata) != len(Ydata):
            print('ERROR: X data and Y data lengths are not the same size, they need to be!')
            return

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify=Y,
                                                        test_size=test_frac, train_size=train_frac)
    self.num_labels = len(keys)
    # self.train_dataset_raw_CNN = TensorDataset(X_train, int_labels_to_list(Y_train, keys))
    # self.test_dataset_raw_CNN = TensorDataset(X_test, int_labels_to_list(Y_test, keys))
    print('X_train length: ' + str(len(X_train)))
    print('X_test length: ' + str(len(X_test)))
    print('Y_train length: ' + str(len(Y_train)))
    print('Y_test length: ' + str(len(Y_test)))
    self.train_dataset_raw = Dataset.from_pandas(pd.DataFrame({'text': X_train, 'labels': Y_train}))
    self.test_dataset_raw = Dataset.from_pandas(pd.DataFrame({'text': X_test, 'labels': Y_test}))
    self.all_metrics = {}
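# --- Hypothetical helper sketch (not part of the original source) ---
# string_labels_to_int is called above but not shown; a minimal version consistent
# with how its two return values are used (integer labels plus the set of label keys)
# might look like this.
def string_labels_to_int(labels):
    keys = sorted(set(labels))                           # distinct string labels
    mapping = {label: i for i, label in enumerate(keys)}
    return [mapping[label] for label in labels], keys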
def train_test_split(dataset: nlp.Dataset):
    _ = dataset.train_test_split(0.1)
def train(
    self,
    training_args: TrainingArguments,
    train_dataset: nlp.Dataset,
    eval_dataset: nlp.Dataset,
    text_col_nm: str = "text",
    label_col_nm: str = "label",
    compute_metrics: Callable = None,
) -> None:
    """Trains and/or finetunes the sequence classification model

    * **training_args** - Transformers `TrainingArguments` object
    * **train_dataset** - Training `Dataset` class object from the nlp library
    * **eval_dataset** - Eval `Dataset` class object from the nlp library
    * **text_col_nm** - Name of the text feature column used as training data (Default "text")
    * **label_col_nm** - Name of the label feature column (Default "label")
    * **compute_metrics** - Custom metrics function callable for `transformers.Trainer`'s compute metrics
    * **return** - None
    """
    # Set default metrics if None
    if not compute_metrics:
        compute_metrics = self._default_metrics

    # Set nlp.Dataset label values in the sequence classifier configuration
    ## Important NOTE: Updating the configuration does not update the sequence classification head module layer
    ## We manually initialize a new linear layer for the "new" labels being trained
    class_label = train_dataset.features[label_col_nm]
    config_data = {
        "num_labels": class_label.num_classes,
        "id2label": {v: n for v, n in enumerate(class_label.names)},
        "label2id": {n: v for v, n in enumerate(class_label.names)},
    }
    self.model.config.update(config_data)
    self._mutate_model_head(class_label=class_label)

    # Batch map datasets as torch tensors with the tokenizer
    def tokenize(batch):
        return self.tokenizer(batch[text_col_nm], padding=True, truncation=True)

    train_dataset = train_dataset.map(tokenize, batch_size=len(train_dataset), batched=True)
    eval_dataset = eval_dataset.map(tokenize, batch_size=len(eval_dataset), batched=True)
    train_dataset.set_format("torch", columns=["input_ids", "attention_mask", label_col_nm])
    eval_dataset.set_format("torch", columns=["input_ids", "attention_mask", label_col_nm])

    # Instantiate the transformers Trainer
    self.trainer = Trainer(
        model=self.model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Train and serialize
    self.trainer.train()
    self.trainer.save_model()
    self.tokenizer.save_pretrained(training_args.output_dir)
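# --- Hypothetical usage sketch (not part of the original source) ---
# Assumes the surrounding class exposes the train() method above (the "classifier"
# object is invented) and that the label column is an nlp ClassLabel feature, which
# train() relies on for num_classes and names.
import nlp
from transformers import TrainingArguments

train_ds = nlp.load_dataset('ag_news', split='train[:1%]')
eval_ds = nlp.load_dataset('ag_news', split='test[:1%]')
args = TrainingArguments(output_dir='./sequence_classifier',
                         num_train_epochs=1,
                         per_device_train_batch_size=8)
# classifier.train(training_args=args, train_dataset=train_ds, eval_dataset=eval_ds)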
def shard(dataset: nlp.Dataset, num_shards=10):
    for shard_id in range(num_shards):
        _ = dataset.shard(num_shards, shard_id)
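# --- Hypothetical driver sketch (not part of the original source) ---
# Exercises the small benchmark helpers above (shuffle, sort, select, filter, map,
# read_formatted, read_formatted_batch, train_test_split, shard) on a tiny in-memory
# nlp.Dataset; the "numbers" column name matches the one sort() expects.
import nlp

dataset = nlp.Dataset.from_dict({'numbers': list(range(100)),
                                 'text': ['example %d' % i for i in range(100)]})

shuffle(dataset)
sort(dataset)
select(dataset)
filter(dataset, function=lambda example: example['numbers'] % 2 == 0)
map(dataset, function=lambda example: {'numbers_plus_one': example['numbers'] + 1})
read_formatted(dataset, length=10, type='numpy')
read_formatted_batch(dataset, length=100, batch_size=10, type='numpy')
train_test_split(dataset)
shard(dataset)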
import pickle
from pathlib import Path

import pandas as pd
from nlp import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Bidirectional, Conv1D, Dense, concatenate
from tensorflow.keras.models import Model

dataset_path = '../datasets/labeled.csv'
dataset = Dataset(dataset_path)
dataset.load()
dataset.preprocess_texts()

num_words = 10000
tokenizer = Tokenizer(num_words=num_words, lower=True)
tokenizer.fit_on_texts(dataset.cleaned_data.text)

# Use a Path so that .open() is available (a plain string has no .open() method)
file_to_save = Path('../datasets/tokenizer.pickle')
with file_to_save.open('wb') as file:
    pickle.dump(tokenizer, file)

# Keep only texts with more than one token, then locate rows carrying the two special labels
count = dataset.cleaned_data['text'].str.split().str.len()
data = dataset.cleaned_data[count > 1]
ia = data[data['label'] == 'This is not norma…'].index
ib = data[data['label'] == '!'].index
data = data.drop(ia)