Example #1
def fine_tune(pos_action, neg_action, tokenizer, model):
    nlg_usr = TemplateNLG(is_user=True)
    nlg_sys = TemplateNLG(is_user=False)
    pos_train_usr_utter = []
    pos_train_sys_utter = []
    neg_train_usr_utter = []
    neg_train_sys_utter = []

    for turn in pos_action:
        if turn[0] != [] and turn[1] != []:
            s_u = nlg_usr.generate(turn[0])
            s_a = nlg_sys.generate(turn[1])
            pos_train_usr_utter.append(s_u)
            pos_train_sys_utter.append(s_a)
    for turn in neg_action:
        if turn[0] != [] and turn[1] != []:
            s_u = nlg_usr.generate(turn[0])
            s_a = nlg_sys.generate(turn[1])
            neg_train_usr_utter.append(s_u)
            neg_train_sys_utter.append(s_a)

    train_usr_utter = pos_train_usr_utter + neg_train_usr_utter
    train_sys_utter = pos_train_sys_utter + neg_train_sys_utter

    train_encoding = tokenizer(train_usr_utter,
                               train_sys_utter,
                               padding=True,
                               truncation=True,
                               max_length=80)
    train_encoding['label'] = [1] * len(pos_train_usr_utter) + [0] * len(
        neg_train_usr_utter)
    train_dataset = Dataset.from_dict(train_encoding)
    train_dataset.set_format('torch',
                             columns=['input_ids', 'attention_mask', 'label'])
    save_dir = os.path.join(root_dir,
                            'convlab2/policy/dqn/NLE/save/script_fine_tune')
    log_dir = os.path.join(
        root_dir, 'convlab2/policy/dqn/NLE/save/script_fine_tune/logs')
    training_args = TrainingArguments(
        output_dir=save_dir,
        num_train_epochs=2,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=128,
        warmup_steps=500,
        weight_decay=0.01,
        evaluate_during_training=False,
        logging_dir=log_dir,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )
    trainer.train()
    trainer.save_model(os.path.join(save_dir, 'fine_tune_checkpoint'))
Example #2
 def __call__(self, examples):
   ds = Dataset.from_dict({'text':examples})
   ds = ds.map(lambda batch: self.tokenizer(batch['text'], truncation=True, padding='max_length'), batched=True, batch_size=512)
   ds.set_format('torch', columns=['input_ids','token_type_ids', 'attention_mask'])
   dataloader = torch.utils.data.DataLoader(ds, batch_size=16)
   res = []
   for batch in tqdm(dataloader):
     batch = {k: v.to(self.device) for k, v in batch.items()}
     outputs = self.model(**batch)
     res.append(outputs[0].softmax(1).detach().cpu())
   return torch.cat(res,dim=0).numpy()
Example #3
def shuffle(dataset: nlp.Dataset):
    _ = dataset.shuffle()
Example #4
def sort(dataset: nlp.Dataset):
    _ = dataset.sort("numbers")
Example #5
def select(dataset: nlp.Dataset):
    _ = dataset.select(range(0, len(dataset), 2))
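
A minimal usage sketch for the three helpers above, assuming the nlp library's in-memory Dataset.from_dict constructor; the demo data and the "numbers" column name are chosen here only to match the sort("numbers") call.

from nlp import Dataset

demo = Dataset.from_dict({"numbers": [3, 1, 2, 5, 4]})
shuffle(demo)  # builds a shuffled copy and discards it, as in the benchmark helpers
sort(demo)     # builds a copy sorted on the "numbers" column
select(demo)   # builds a copy containing every second row
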
Example #6

train_usr_utter, train_sys_utter = generate_data(multiwoz_train)
val_usr_utter, val_sys_utter = generate_data(multiwoz_val)
test_usr_utter, test_sys_utter = generate_data(multiwoz_test)

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained('roberta-base')
train_encoding = tokenizer(train_usr_utter,
                           train_sys_utter,
                           padding=True,
                           truncation=True,
                           max_length=80)
train_encoding['label'] = [1] * (len(train_usr_utter) //
                                 2) + [0] * (len(train_usr_utter) // 2)
train_dataset = Dataset.from_dict(train_encoding)
train_dataset.set_format('torch',
                         columns=['input_ids', 'attention_mask', 'label'])
val_encoding = tokenizer(val_usr_utter,
                         val_sys_utter,
                         padding=True,
                         truncation=True,
                         max_length=80)
val_encoding['label'] = [1] * (len(val_usr_utter) //
                               2) + [0] * (len(val_usr_utter) // 2)
val_dataset = Dataset.from_dict(val_encoding)
val_dataset.set_format('torch',
                       columns=['input_ids', 'attention_mask', 'label'])
test_encoding = tokenizer(test_usr_utter,
                          test_sys_utter,
                          padding=True,
                          truncation=True,
                          max_length=80)
Example #7
def filter(dataset: nlp.Dataset, **kwargs):
    _ = dataset.filter(**kwargs)
Example #8
def map(dataset: nlp.Dataset, **kwargs):
    _ = dataset.map(**kwargs)
                                   "SentiNews_sentence_test.tsv",
                                   sep="\t")

    df_document_sl_hr_train = pd.read_csv(sentinews_location +
                                          "HRSLSentiNews_document_train.tsv",
                                          sep="\t")
    df_document_sl_hr_valid = pd.read_csv(sentinews_location +
                                          "HRSLSentiNews_document_valid.tsv",
                                          sep="\t")

    # NO test hr mixed as HR test will be used as final test

    # gather everyone if you want to have a single DatasetDict
    document = DatasetDict({
        "train":
        Dataset.from_pandas(df_document_sl_hr_train),
        "valid":
        Dataset.from_pandas(df_document_sl_hr_valid),
        "test":
        Dataset.from_pandas(df_document_croatian_test)
    })
    # document.save_to_disk("sentinews-document")
    # gather everyone if you want to have a single DatasetDict
    paragraph = DatasetDict({
        "train": Dataset.from_pandas(df_paragraph_train),
        "valid": Dataset.from_pandas(df_paragraph_valid),
        "test": Dataset.from_pandas(df_paragraph_test),
    })
    # paragraph.save_to_disk("sentinews-paragraph")
    # gather everyone if you want to have a single DatasetDict
    sentence = DatasetDict({
Example #10
                    one_token, one_tag = line_split
                    # strip the trailing newline on the right
                    one_tag = one_tag.rstrip('\n')
                    sentence.append(one_token)
                    sentence_tag.append(one_tag)
                else:
                    print(f"这一行出现问题{line},不是2个字段")
            else:
                # An empty line means the next sentence is starting: append sentence and sentence_tag to the overall tokens and ner_tags, then reset them
                if sentence and sentence_tag:
                    tokens.append(sentence)
                    ner_tags.append(sentence_tag)
                    sentence = []
                    sentence_tag = []
    if len(tokens) != len(ner_tags):
        print('tokens and ner_tags have different lengths; there is a problem with the file that was read, please check it')
        result = {'tokens': [], 'ner_tags': []}
    else:
        result = {'tokens': tokens, 'ner_tags': ner_tags}
    return result


if __name__ == '__main__':
    train_file = "msra/msra_train_bio.txt"
    test_file = "msra/msra_test_bio.txt"
    mini_file = "msra/mini.txt"
    # test_dict = read_ner_txt(test_file)
    # dataset = Dataset.from_dict(test_dict)
    mini_dict = read_ner_txt(mini_file)
    dataset = Dataset.from_dict(mini_dict)
    print(dataset)
Example #11
                line_split = line.split('\t')
                if len(line_split) == 2:
                    one_token, one_tag = line_split
                    # strip the trailing newline on the right
                    one_tag = one_tag.rstrip('\n')
                    sentence.append(one_token)
                    sentence_tag.append(one_tag)
                else:
                    print(f"这一行出现问题{line},不是2个字段")
            else:
                # An empty line means the next sentence is starting: append sentence and sentence_tag to the overall tokens and ner_tags, then reset them
                if sentence and sentence_tag:
                    tokens.append(sentence)
                    ner_tags.append(sentence_tag)
                    sentence = []
                    sentence_tag = []
    if len(tokens) != len(ner_tags):
        print('tokens and ner_tags have different lengths; there is a problem with the file that was read, please check it')
        result = {'tokens': [], 'ner_tags': []}
    else:
        result = {'tokens': tokens, 'ner_tags': ner_tags}
    return result


if __name__ == '__main__':
    dev_file = "dataset/cosmetics/dev.txt"
    train_file = "dataset/cosmetics/train.txt"
    test_file = "dataset/cosmetics/test.txt"
    test_dict = read_ner_txt(test_file)
    dataset = Dataset.from_dict(test_dict)
    print(dataset)
Example #12
def read_formatted_batch(dataset: nlp.Dataset, length, batch_size, type):
    with dataset.formatted_as(type=type):
        for i in range(0, length, batch_size):
            _ = dataset[i:i + batch_size]
Example #13
def read_formatted(dataset: nlp.Dataset, length, type):
    with dataset.formatted_as(type=type):
        for i in range(length):
            _ = dataset[i]
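
A minimal usage sketch for the two formatted-read helpers above (Examples #12 and #13), again assuming the nlp library's Dataset.from_dict; "numpy" is one of the format types that formatted_as accepts.

from nlp import Dataset

demo = Dataset.from_dict({"numbers": list(range(1000))})
read_formatted_batch(demo, length=len(demo), batch_size=64, type="numpy")  # sliced batch reads
read_formatted(demo, length=100, type="numpy")                             # row-by-row reads
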
Example #14
	def __init__(self, Xdata=None, Ydata=None, csv=None, xlsx=None, x_col='X', y_col='Y', models='all', test_frac=0.1, train_frac=0.9):
		if models == 'all':
			self.model_list = [
				'bert-base-uncased',
				'albert-base-v2',
				'roberta-base',
				'linear_SVM',
				'multinomial_naive_bayesian',
			]
		elif models == 'count-vectorizer':
			self.model_list = [
				'linear_SVM',
				'multinomial_naive_bayesian',
			]
		elif models == 'transformers':
			self.model_list = [
				'bert-base-uncased',
				'albert-base-v2',
				'roberta-base',
			]
		else:
			print('Models not recognized, the available options are currently "all", "count-vectorizer", and "transformers"')
			return
		if csv is not None and xlsx is not None and Xdata is not None:
			print("You have provided too much data, give just x and y data, or a csv or xlsx file!")
			return
		if csv is not None:
			csv_data = pd.read_csv(csv)
			Xdata = csv_data[x_col]
			Ydata = csv_data[y_col]
		if xlsx is not None:
			xlsx_data = pd.read_excel(xlsx)
			Xdata = xlsx_data[x_col]
			Ydata = xlsx_data[y_col]
		if isinstance(Xdata, pd.Series):
			print('converting pandas series to list')
			Xdata=list(Xdata)
		if isinstance(Ydata, pd.Series):
			print('converting pandas series to list')
			Ydata=list(Ydata)

		if Xdata is None or Ydata is None:
			print('Either you have not put in your own data, or you have only put in X or Y data, loading default dataset...')
			self.train_dataset_raw, self.test_dataset_raw = load_dataset('imdb', split=['train', 'test'])
			X=self.train_dataset_raw['text']+self.test_dataset_raw['text']
			Y=self.train_dataset_raw['label']+self.test_dataset_raw['label']
			keys=set(Y)
		else:
			X=Xdata
			Y=Ydata
			if all(isinstance(n, int) for n in Y):
				keys=set(Y)
			else:
				Y,keys=string_labels_to_int(Y)
			# add method to make min label 0
			if min(Y) >= 1:
				Y = [y - min(Y) for y in Y]
		if len(X) < 20:
			print('dataset is really small, using default test/train split (0.25)')
			test_frac = None
			train_frac = None
		if len(X) < 8:
			print('dataset is really too small, using default test/train split (0.5)')
			test_frac = 0.5
			train_frac = 0.5

		if len(X) != len(Y):
			print('ERROR: X data and Y data lengths are not the same size, they need to be!')
			return

		X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                        stratify=Y, 
                                                        test_size=test_frac,
                                                        train_size=train_frac)
		self.num_labels=len(keys)
		#self.train_dataset_raw_CNN = TensorDataset(X_train, int_labels_to_list(Y_train,keys))
		#self.test_dataset_raw_CNN = TensorDataset(X_test, int_labels_to_list(Y_test,keys))
		print('X_train length: ' + str(len(X_train)))
		print('X_test length: ' + str(len(X_test)))
		print('Y_train length: ' + str(len(Y_train)))
		print('Y_test length: ' + str(len(Y_test)))   
		self.train_dataset_raw = Dataset.from_pandas(pd.DataFrame({'text':X_train, 'labels': Y_train}))
		self.test_dataset_raw = Dataset.from_pandas(pd.DataFrame({'text':X_test, 'labels': Y_test}))	
		self.all_metrics = {}
Example #15
def train_test_split(dataset: nlp.Dataset):
    _ = dataset.train_test_split(0.1)
Example #16
    def train(
        self,
        training_args: TrainingArguments,
        train_dataset: nlp.Dataset,
        eval_dataset: nlp.Dataset,
        text_col_nm: str = "text",
        label_col_nm: str = "label",
        compute_metrics: Callable = None,
    ) -> None:
        """Trains and/or finetunes the sequence classification model

        * **training_args** - Transformers `TrainingArguments` object model
        * **train_dataset** - Training `Dataset` class object from the nlp library
        * **eval_dataset** - Eval `Dataset` class object from the nlp library
        * **text_col_nm** - Name of the text feature column used as training data (Default "text")
        * **label_col_nm** - Name of the label feature column (Default "label")
        * **compute_metrics** - Custom metrics function callable for `transformers.Trainer`'s compute metrics
        * **return** - None
        """
        # Set default metrics if None
        if not compute_metrics:
            compute_metrics = self._default_metrics

        # Set nlp.Dataset label values in sequence classifier configuration
        ## Important NOTE: Updating the configuration does not update the sequence classification head module layer
        ## We are manually initializing a new linear layer for the "new" labels being trained
        class_label = train_dataset.features[label_col_nm]
        config_data = {
            "num_labels": class_label.num_classes,
            "id2label": {v: n for v, n in enumerate(class_label.names)},
            "label2id": {n: v for v, n in enumerate(class_label.names)},
        }
        self.model.config.update(config_data)
        self._mutate_model_head(class_label=class_label)

        # Batch map datasets as torch tensors with tokenizer
        def tokenize(batch):
            return self.tokenizer(batch[text_col_nm], padding=True, truncation=True)

        train_dataset = train_dataset.map(
            tokenize, batch_size=len(train_dataset), batched=True
        )
        eval_dataset = eval_dataset.map(
            tokenize, batch_size=len(eval_dataset), batched=True
        )
        train_dataset.set_format(
            "torch", columns=["input_ids", "attention_mask", label_col_nm]
        )
        eval_dataset.set_format(
            "torch", columns=["input_ids", "attention_mask", label_col_nm]
        )

        # Instantiate transformers trainer
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
        )

        # Train and serialize
        self.trainer.train()
        self.trainer.save_model()
        self.tokenizer.save_pretrained(training_args.output_dir)
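
A minimal usage sketch for the train() method above, assuming a wrapper instance named classifier that exposes it, and assuming the old nlp library's load_dataset with split slicing is available; the dataset choice, output directory, and hyperparameters are illustrative only.

import nlp
from transformers import TrainingArguments

# IMDB ships a "text" column and a ClassLabel "label" column, matching the default column names above.
train_ds = nlp.load_dataset("imdb", split="train[:2%]")
eval_ds = nlp.load_dataset("imdb", split="test[:2%]")

args = TrainingArguments(
    output_dir="./seq_clf_out",  # also where train() saves the model and tokenizer
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
)

# classifier is a hypothetical instance of the class this train() method belongs to.
classifier.train(training_args=args, train_dataset=train_ds, eval_dataset=eval_ds)
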
Example #17
def shard(dataset: nlp.Dataset, num_shards=10):
    for shard_id in range(num_shards):
        _ = dataset.shard(num_shards, shard_id)
Example #18
File: training.py  Project: dhvalden/phd
import pandas as pd
import pickle
from nlp import Dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Bidirectional, Conv1D, Dense, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelBinarizer

dataset_path = '../datasets/labeled.csv'
dataset = Dataset(dataset_path)
dataset.load()
dataset.preprocess_texts()

num_words = 10000

tokenizer = Tokenizer(num_words=num_words, lower=True)
tokenizer.fit_on_texts(dataset.cleaned_data.text)

file_to_save = '../datasets/tokenizer.pickle'
with open(file_to_save, 'wb') as file:
    pickle.dump(tokenizer, file)

count = dataset.cleaned_data['text'].str.split().str.len()
data = dataset.cleaned_data[count > 1]
ia = data[data['label'] == 'This is not norma…'].index
ib = data[data['label'] == '!'].index
data = data.drop(ia)