# The builders below assume the surrounding project provides `Model`,
# `task_info`, `f1_score` and `get_graph_entities`; only the generic
# imports they rely on are listed here.
import os

import datasets
import numpy as np
from transformers import AutoTokenizer


# BIO sequence labelling (named-entity style) data; tokenization is done by a
# shared module-level `tokenize` helper (see the sketch after the next builder).
def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs):
    dataset = datasets.load_dataset(
        datasets.Bio,
        data_dir=data_dir,
        cache_dir=data_dir,
        data_files=datasets.Bio.default_files(data_dir))
    dataset.rename_column_('bio', 'labels')
    dataset = dataset.map(
        lambda examples: tokenize(examples, tokenizer, max_length),
        batched=True,
        cache_file_names={
            k: d._get_cache_file_path(f"{task_name}-{k}-tokenized")
            for k, d in dataset.items()
        })
    # Drop sentences whose tokenization overflowed max_length.
    dataset = dataset.filter(
        lambda x: not x['overflow'],
        cache_file_names={
            k: d._get_cache_file_path(f"{task_name}-{k}-filtered")
            for k, d in dataset.items()
        })
    dataset.set_format(type='torch',
                       columns=[
                           'input_ids', 'token_type_ids', 'attention_mask',
                           'word_index', 'word_attention_mask', 'labels'
                       ])
    return dataset


# CoNLL-U data for graph-style (semantic) dependency parsing: the token-level
# 'head'/'deprel' columns are dropped, and the 'head'/'labels' matrices set in
# the output format are presumably produced by the shared `tokenize` helper
# from the 'deps' annotation.
def build_dataset(data_dir, task_name, tokenizer, max_length=512, **kwargs):
    dataset = datasets.load_dataset(
        datasets.Conllu,
        data_dir=data_dir,
        cache_dir=data_dir,
        data_files=datasets.Conllu.default_files(data_dir))
    dataset.remove_columns_(
        ["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "misc"])
    dataset = dataset.map(
        lambda examples: tokenize(examples, tokenizer, max_length),
        batched=True,
        cache_file_names={
            k: d._get_cache_file_path(f"{task_name}-{k}-tokenized")
            for k, d in dataset.items()
        })
    dataset = dataset.filter(
        lambda x: not x['overflow'],
        cache_file_names={
            k: d._get_cache_file_path(f"{task_name}-{k}-filtered")
            for k, d in dataset.items()
        })
    dataset.set_format(type='torch',
                       columns=[
                           'input_ids', 'token_type_ids', 'attention_mask',
                           'word_index', 'word_attention_mask', 'head',
                           'labels'
                       ])
    return dataset
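

# The two builders above call a module-level tokenize(examples, tokenizer,
# max_length) helper that is not part of this listing. The sketch below is an
# assumption of what it could look like, inferred from the columns the
# builders consume ('word_index', 'word_attention_mask', 'overflow'); it is
# not the project's actual implementation, and it omits the dependency-graph
# columns ('head', 'labels') that the second builder also expects.
def tokenize(examples, tokenizer, max_length):
    # CoNLL-U data exposes the words as 'form', the BIO data as 'words'.
    words = examples['form'] if 'form' in examples else examples['words']
    res = tokenizer(words,
                    is_split_into_words=True,
                    max_length=max_length,
                    truncation=True)
    word_index, word_attention_mask, overflow = [], [], []
    for encoding, sentence in zip(res.encodings, words):
        word_index.append([])
        word_attention_mask.append([])
        last_word_idx = -1
        current_length = 0
        # Record the position of the first subword of every word,
        # skipping the leading/trailing special tokens.
        for word_idx in encoding.words[1:-1]:
            if word_idx != last_word_idx:
                word_index[-1].append(current_length)
                word_attention_mask[-1].append(True)
            current_length += 1
            last_word_idx = word_idx
        # Sentences that lost words to truncation are flagged so the
        # builders can filter them out.
        overflow.append(len(word_index[-1]) < len(sentence))
    res['word_index'] = word_index
    res['word_attention_mask'] = word_attention_mask
    res['overflow'] = overflow
    return res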


# Part-of-speech tagging: the 'xpos' column becomes the label of every word.
def build_dataset(model, data_dir):
    dataset = datasets.load_dataset(datasets.Conllu,
                                    data_dir=data_dir,
                                    cache_dir=data_dir,
                                    xpos=os.path.join(data_dir,
                                                      "xpos_labels.txt"))
    dataset.remove_columns_(
        ["id", "lemma", "upos", "feats", "head", "deprel", "deps", "misc"])
    dataset.rename_column_('xpos', 'labels')
    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer,
                                              use_fast=True)

    def tokenize(examples):
        res = tokenizer(
            examples['form'],
            is_split_into_words=True,
            max_length=model.transformer.config.max_position_embeddings,
            truncation=True)
        labels = []
        logits_mask = []
        for encoding, labels_ in zip(res.encodings, examples['labels']):
            labels.append([])
            logits_mask.append([])
            last_word_idx = -1
            labels_pointer = -1
            # Walk over subword tokens (skipping [CLS]/[SEP]); only the first
            # subword of each word contributes to the logits.
            for word_idx in encoding.words[1:-1]:
                if word_idx != last_word_idx:
                    logits_mask[-1].append(True)
                    labels_pointer += 1
                else:
                    logits_mask[-1].append(False)
                labels[-1].append(labels_[labels_pointer])
                last_word_idx = word_idx
        res['labels'] = labels
        res['logits_mask'] = logits_mask
        return res

    dataset = dataset.map(lambda examples: tokenize(examples),
                          batched=True,
                          cache_file_names={
                              k: d._get_cache_file_path(f"{k}-tokenized")
                              for k, d in dataset.items()
                          })
    dataset.set_format(type='torch',
                       columns=[
                           'input_ids', 'token_type_ids', 'attention_mask',
                           'logits_mask', 'labels'
                       ])
    # task_info is assumed to be a module-level descriptor carrying task_name.
    dataset.shuffle(
        indices_cache_file_names={
            k: d._get_cache_file_path(
                f"{task_info.task_name}-{k}-shuffled-index-{model.hparams.seed}"
            )
            for k, d in dataset.items()
        })
    return dataset, None


# Named-entity recognition: BIO tags per word; returns the metric function
# together with the label names.
def build_dataset(model, data_dir):
    dataset = datasets.load_dataset(datasets.Bio,
                                    data_dir=data_dir,
                                    cache_dir=data_dir,
                                    bio=os.path.join(data_dir,
                                                     "ner_labels.txt"))
    dataset.rename_column_('bio', 'labels')
    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer,
                                              use_fast=True)

    def tokenize(examples):
        res = tokenizer(
            examples['words'],
            is_split_into_words=True,
            max_length=model.transformer.config.max_position_embeddings,
            truncation=True)
        word_index = []
        word_attention_mask = []
        for encoding in res.encodings:
            word_index.append([])
            word_attention_mask.append([])
            last_word_idx = -1
            current_length = 0
            # Index of the first subword of each word, so the model can
            # gather one vector per word.
            for word_idx in encoding.words[1:-1]:
                if word_idx != last_word_idx:
                    word_index[-1].append(current_length)
                    word_attention_mask[-1].append(True)
                current_length += 1
                last_word_idx = word_idx
        res['word_index'] = word_index
        res['word_attention_mask'] = word_attention_mask
        return res

    dataset = dataset.map(
        lambda examples: tokenize(examples),
        batched=True,
        cache_file_names={
            k: d._get_cache_file_path(f"{task_info.task_name}-{k}-tokenized")
            for k, d in dataset.items()
        })
    dataset.set_format(type='torch',
                       columns=[
                           'input_ids', 'token_type_ids', 'attention_mask',
                           'word_index', 'word_attention_mask', 'labels'
                       ])
    dataset.shuffle(
        indices_cache_file_names={
            k: d._get_cache_file_path(
                f"{task_info.task_name}-{k}-shuffled-index-{model.hparams.seed}"
            )
            for k, d in dataset.items()
        })
    return dataset, (
        f1_score,
        dataset[datasets.Split.TRAIN].features['labels'].feature.names)


# Dependency parsing: 'deprel' becomes the label column and the gold 'head'
# column is kept for the biaffine arc targets.
def build_dataset(model, data_dir):
    dataset = datasets.load_dataset(datasets.Conllu,
                                    data_dir=data_dir,
                                    cache_dir=data_dir,
                                    deprel=os.path.join(data_dir,
                                                        "dep_labels.txt"))
    dataset.remove_columns_(
        ["id", "lemma", "upos", "xpos", "feats", "deps", "misc"])
    dataset.rename_column_('deprel', 'labels')
    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer,
                                              use_fast=True)

    def tokenize(examples):
        res = tokenizer(
            examples['form'],
            is_split_into_words=True,
            max_length=model.transformer.config.max_position_embeddings,
            truncation=True)
        word_index = []
        word_attention_mask = []
        for encoding in res.encodings:
            word_index.append([])
            word_attention_mask.append([])
            last_word_idx = -1
            current_length = 0
            # Index of the first subword of each word.
            for word_idx in encoding.words[1:-1]:
                if word_idx != last_word_idx:
                    word_index[-1].append(current_length)
                    word_attention_mask[-1].append(True)
                current_length += 1
                last_word_idx = word_idx
        res['word_index'] = word_index
        res['word_attention_mask'] = word_attention_mask
        return res

    dataset = dataset.map(
        lambda examples: tokenize(examples),
        batched=True,
        cache_file_names={
            k: d._get_cache_file_path(f"{task_info.task_name}-{k}-tokenized")
            for k, d in dataset.items()
        })
    dataset.set_format(type='torch',
                       columns=[
                           'input_ids', 'token_type_ids', 'attention_mask',
                           'word_index', 'word_attention_mask', 'head',
                           'labels'
                       ])
    dataset.shuffle(
        indices_cache_file_names={
            k: d._get_cache_file_path(
                f"{task_info.task_name}-{k}-shuffled-index-{model.hparams.seed}"
            )
            for k, d in dataset.items()
        })
    return dataset, None


# Word segmentation: every subword token is labelled with the boundary scheme
# {'B': 1, 'I': 0}, i.e. 1 at the first subword of a word and 0 elsewhere.
def build_dataset(model: Model, data_dir, task_name):
    dataset = datasets.load_dataset(datasets.Conllu,
                                    data_dir=data_dir,
                                    cache_dir=data_dir)
    dataset.remove_columns_([
        "id", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps",
        "misc"
    ])
    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer,
                                              use_fast=True)

    def tokenize(examples):
        res = tokenizer(
            examples['form'],
            is_split_into_words=True,
            max_length=model.transformer.config.max_position_embeddings,
            truncation=True)
        labels = []
        for encoding in res.encodings:
            labels.append([])
            last_word_idx = -1
            for word_idx in encoding.words[1:-1]:
                # 1 ('B') when a new word starts, 0 ('I') for continuations.
                labels[-1].append(int(word_idx != last_word_idx))
                last_word_idx = word_idx
        res['labels'] = labels
        return res

    dataset = dataset.map(
        lambda examples: tokenize(examples),
        batched=True,
        cache_file_names={
            k: d._get_cache_file_path(f"{task_name}-{k}-tokenized")
            for k, d in dataset.items()
        })
    dataset.set_format(
        type='torch',
        columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
    dataset.shuffle(
        indices_cache_file_names={
            k: d._get_cache_file_path(
                f"{task_name}-{k}-shuffled-index-{model.hparams.seed}")
            for k, d in dataset.items()
        })
    return dataset
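

# Decoding sketch (an illustration added here, not part of the original code):
# turn the per-token B/I predictions produced by the segmentation task above
# back into [start, end) spans over the subword tokens.
def decode_segments(labels):
    """labels: list of 0/1 ints, 1 = 'B' (word start), 0 = 'I' (continuation)."""
    spans = []
    for idx, label in enumerate(labels):
        if label == 1 or not spans:
            spans.append([idx, idx + 1])
        else:
            spans[-1][1] = idx + 1
    return spans

# Example: decode_segments([1, 0, 1, 1, 0]) -> [[0, 2], [2, 3], [3, 5]]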


# Semantic dependency parsing (graph): for every sentence of n words, build an
# (n, n + 1) arc matrix 'head' and a matching relation-label matrix 'labels'
# from the 'deps' annotation; the extra column accommodates the root head.
def build_dataset(model, data_dir):
    dataset = datasets.load_dataset(datasets.Conllu,
                                    data_dir=data_dir,
                                    cache_dir=data_dir,
                                    deps=os.path.join(data_dir,
                                                      "deps_labels.txt"))
    dataset.remove_columns_(
        ["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "misc"])
    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer,
                                              use_fast=True)

    def tokenize(examples):
        res = tokenizer(
            examples['form'],
            is_split_into_words=True,
            max_length=model.transformer.config.max_position_embeddings,
            truncation=True)
        word_index = []
        word_attention_mask = []
        for encoding in res.encodings:
            word_index.append([])
            word_attention_mask.append([])
            last_word_idx = -1
            current_length = 0
            # Index of the first subword of each word.
            for word_idx in encoding.words[1:-1]:
                if word_idx != last_word_idx:
                    word_index[-1].append(current_length)
                    word_attention_mask[-1].append(True)
                current_length += 1
                last_word_idx = word_idx
        res['word_index'] = word_index
        res['word_attention_mask'] = word_attention_mask

        heads = []
        labels = []
        for forms, deps in zip(examples['form'], examples['deps']):
            sentence_len = len(forms)
            heads.append(
                np.zeros((sentence_len, sentence_len + 1), dtype=np.int64))
            labels.append(
                np.zeros((sentence_len, sentence_len + 1), dtype=np.int64))
            # Mark each annotated arc and its relation label.
            for idx, head, rel in zip(deps['id'], deps['head'], deps['rel']):
                heads[-1][idx, head] = 1
                labels[-1][idx, head] = rel
        res['head'] = heads
        res['labels'] = labels
        return res

    dataset = dataset.map(lambda examples: tokenize(examples),
                          batched=True,
                          cache_file_names={
                              k: d._get_cache_file_path(f"{k}-tokenized")
                              for k, d in dataset.items()
                          })
    dataset.set_format(type='torch',
                       columns=[
                           'input_ids', 'token_type_ids', 'attention_mask',
                           'word_index', 'word_attention_mask', 'head',
                           'labels'
                       ])
    dataset.shuffle()
    return dataset, get_graph_entities


# Semantic role labelling: for each predicate position, one row of the
# (n, n) label matrix holds the role of every word with respect to it.
def build_dataset(model: Model, data_dir, task_name):
    dataset = datasets.load_dataset(datasets.Srl,
                                    data_dir=data_dir,
                                    cache_dir=data_dir,
                                    labels=os.path.join(
                                        data_dir, "srl_labels.txt"))
    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer,
                                              use_fast=True)

    def tokenize(examples):
        res = tokenizer(
            examples['words'],
            is_split_into_words=True,
            max_length=model.transformer.config.max_position_embeddings,
            truncation=True)
        word_index = []
        for encoding in res.encodings:
            word_index.append([])
            last_word_idx = -1
            current_length = 0
            # Index of the first subword of each word.
            for word_idx in encoding.words[1:-1]:
                if word_idx != last_word_idx:
                    word_index[-1].append(current_length)
                current_length += 1
                last_word_idx = word_idx
        res['word_index'] = word_index
        res['word_attention_mask'] = [[True] * len(index)
                                      for index in word_index]

        labels = []
        for predicates, roles in zip(examples['predicate'],
                                     examples['roles']):
            sentence_len = len(predicates)
            labels.append(
                np.zeros((sentence_len, sentence_len), dtype=np.int64))
            # Each non-'_' predicate consumes the next role sequence.
            for idx, predicate in enumerate(predicates):
                if predicate != '_':
                    srl = np.asarray(roles.pop(0), dtype=np.int64)
                    labels[-1][idx, :] = srl
        res['labels'] = labels
        return res

    dataset = dataset.map(lambda examples: tokenize(examples),
                          batched=True,
                          cache_file_names={
                              k: d._get_cache_file_path(f"{k}-tokenized")
                              for k, d in dataset.items()
                          })
    dataset.set_format(type='torch',
                       columns=[
                           'input_ids', 'token_type_ids', 'attention_mask',
                           'word_index', 'word_attention_mask', 'labels'
                       ])
    dataset.shuffle(
        indices_cache_file_names={
            k: d._get_cache_file_path(
                f"{task_name}-{k}-shuffled-index-{model.hparams.seed}")
            for k, d in dataset.items()
        })
    return dataset


# Semantic dependency parsing, list-based variant of the graph construction
# above, with a sanity check that word_index and head have matching lengths.
def build_dataset(model: Model, data_dir, task_name):
    dataset = datasets.load_dataset(datasets.Conllu,
                                    data_dir=data_dir,
                                    cache_dir=data_dir)
    dataset.remove_columns_(
        ["id", "lemma", "upos", "xpos", "feats", "head", "deprel", "misc"])
    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer,
                                              use_fast=True)

    def tokenize(examples):
        res = tokenizer(
            examples['form'],
            is_split_into_words=True,
            max_length=model.transformer.config.max_position_embeddings,
            truncation=True)
        word_index = []
        for encoding in res.encodings:
            word_index.append([])
            last_word_idx = -1
            current_length = 0
            # Index of the first subword of each word.
            for word_idx in encoding.words[1:-1]:
                if word_idx != last_word_idx:
                    word_index[-1].append(current_length)
                current_length += 1
                last_word_idx = word_idx
        res['word_index'] = word_index
        res['word_attention_mask'] = [[True] * len(index)
                                      for index in word_index]

        heads = []
        labels = []
        for forms, deps in zip(examples['form'], examples['deps']):
            sentence_len = len(forms)
            heads.append([[0 for j in range(sentence_len + 1)]
                          for i in range(sentence_len)])
            labels.append([[0 for j in range(sentence_len + 1)]
                           for i in range(sentence_len)])
            for idx, head, rel in zip(deps['id'], deps['head'], deps['rel']):
                heads[-1][idx][head] = 1
                labels[-1][idx][head] = rel
        res['head'] = heads
        res['labels'] = labels

        # Sanity check: one row of the arc matrix per indexed word.
        for word_index_, head in zip(res['word_index'], res['head']):
            assert len(word_index_) == len(head)
        return res

    dataset = dataset.map(
        lambda examples: tokenize(examples),
        batched=True,
        cache_file_names={
            k: d._get_cache_file_path(f"{task_name}-{k}-tokenized")
            for k, d in dataset.items()
        })
    dataset.set_format(type='torch',
                       columns=[
                           'input_ids', 'token_type_ids', 'attention_mask',
                           'word_index', 'word_attention_mask', 'head',
                           'labels'
                       ])
    dataset.shuffle(
        indices_cache_file_names={
            k: d._get_cache_file_path(
                f"{task_name}-{k}-shuffled-index-{model.hparams.seed}")
            for k, d in dataset.items()
        })
    return dataset
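

# Usage sketch (assumptions: `model` is an already constructed Model whose
# hparams carry `transformer` and `seed`, and ./data holds the CoNLL-U files;
# batching and padding into a DataLoader are out of scope here). This only
# illustrates how the builder above is meant to be called.
if __name__ == '__main__':
    model = ...  # a Model instance configured elsewhere (assumption)
    dataset = build_dataset(model, './data', 'sdp')
    train = dataset[datasets.Split.TRAIN]
    print(len(train), train[0]['input_ids'].shape)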