from sklearn.model_selection import train_test_split

from NLPDatasetIO.dataset import Dataset


def main():
    dataset = Dataset('data/annotation', 'inception')
    train_documents, test_documents = train_test_split(dataset.documents,
                                                        test_size=0.1,
                                                        random_state=42)
    dataset.documents = train_documents
    dataset.save('conll', path_to_save='data/annotation/train.tsv', sep='\t')
    dataset.documents = test_documents
    dataset.save('conll', path_to_save='data/annotation/test.tsv', sep='\t')
    # the held-out 10% doubles as the dev set
    dataset.save('conll', path_to_save='data/annotation/dev.tsv', sep='\t')
import codecs

from NLPDatasetIO.dataset import Dataset


def main():
    predicted_test_set = Dataset('../predicted_biobert.txt', 'conll')
    output_filename = 'entities.tsv'
    with codecs.open(output_filename, 'w+', encoding='utf-8') as output_file:
        for document in predicted_test_set.documents:
            for entity in document.entities:
                output_file.write(f"{entity.text}\t{entity.type}\n")
import os
from argparse import ArgumentParser

from NLPDatasetIO.dataset import Dataset


def main():
    parser = ArgumentParser()
    parser.add_argument('--input_path', default=r'data_test.json')
    parser.add_argument('--output_path', default=r'../data/otzovik_conll/test.tsv')
    parser.add_argument('--input_format', default='json')
    args = parser.parse_args()

    input_path = args.input_path
    input_format = args.input_format
    output_path = args.output_path

    output_dir = os.path.dirname(output_path)
    if output_dir != '' and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    dataset = Dataset(input_path, input_format)
    dataset.save('conll', path_to_save=output_path, sep='\t')
import codecs
import dataclasses
import json
import os
from argparse import ArgumentParser

from NLPDatasetIO.dataset import Dataset


def main():
    parser = ArgumentParser()
    parser.add_argument('--predicted_path',
                        default='../../rudrec_markup/predicted_biobert.txt')
    parser.add_argument('--output_path', default=r'entities.json')
    parser.add_argument('--output_num_docs', default='entities_num_docs.txt')
    args = parser.parse_args()

    predicted_path = args.predicted_path
    output_num_docs_path = args.output_num_docs
    output_dir = os.path.dirname(output_num_docs_path)
    if output_dir != '' and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    output_path = args.output_path
    output_dir = os.path.dirname(output_path)
    if output_dir != '' and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    predicted_test_set = Dataset(predicted_path, 'conll')
    with codecs.open(output_path, 'w+', encoding='utf-8') as output_file, \
            codecs.open(output_num_docs_path, 'w+', encoding='utf-8') as stats_file:
        for document in predicted_test_set.documents:
            doc_dict = {"sent_id": document.doc_id, "sent_text": document.text}
            entities = []
            for entity in document.entities:
                entity_dict = dataclasses.asdict(entity)
                del entity_dict["label"]
                entities.append(entity_dict)
            doc_dict["entities"] = entities
            # emit one JSON object per line, skipping sentences without entities
            if len(entities) > 0:
                json.dump(doc_dict, output_file, ensure_ascii=False)
                output_file.write('\n')
        stats_file.write(f"Num sentences: {len(predicted_test_set.documents)}\n")
from NLPDatasetIO.dataset import Dataset


def test_load_dataset():
    dataset = Dataset('data/data_conll.txt', 'conll', sep='\t')
    gold_tokens = [
        ['22', '-', 'oxacalcitriol', 'suppresses', 'secondary',
         'hyperparathyroidism', 'without', 'inducing', 'low', 'bone',
         'turnover', 'in', 'dogs', 'with', 'renal', 'failure', '.'],
        ['BACKGROUND', ':', 'Calcitriol', 'therapy', 'suppresses', 'serum',
         'levels', 'of', 'parathyroid', 'hormone', '(', 'PTH', ')', 'in',
         'patients', 'with', 'renal', 'failure', 'but', 'has', 'several',
         'drawbacks', ',', 'including', 'hypercalcemia', 'and', '/', 'or',
         'marked', 'suppression', 'of', 'bone', 'turnover', ',', 'which',
         'may', 'lead', 'to', 'adynamic', 'bone', 'disease', '.'],
    ]
    gold_labels = [
        ['O', 'O', 'O', 'O', 'B-DISO', 'I-DISO', 'O', 'O', 'B-DISO',
         'I-DISO', 'I-DISO', 'O', 'O', 'O', 'B-DISO', 'I-DISO', 'O'],
        ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
         'O', 'O', 'O', 'B-DISO', 'I-DISO', 'O', 'O', 'O', 'O', 'O', 'O',
         'B-DISO', 'O', 'O', 'O', 'O', 'B-DISO', 'I-DISO', 'I-DISO',
         'I-DISO', 'O', 'O', 'O', 'O', 'O', 'B-DISO', 'I-DISO', 'I-DISO',
         'O'],
    ]
    example_id = 0
    for tokens, labels in dataset.iterate_token_level():
        tokens_length = len(tokens)
        labels_length = len(labels)
        assert tokens_length == labels_length, \
            f'Length of tokens and labels mismatch at example {example_id}'
        assert tokens_length == len(gold_tokens[example_id]), \
            f'Read and gold tokens length mismatch {example_id}'
        assert labels_length == len(gold_labels[example_id]), \
            f'Read and gold labels length mismatch {example_id}'
        for token_idx in range(tokens_length):
            assert tokens[token_idx] == gold_tokens[example_id][token_idx], \
                f'Token mismatch {example_id}'
            assert labels[token_idx] == gold_labels[example_id][token_idx], \
                f'Label mismatch {example_id}'
        example_id += 1
        if example_id == 2:
            break
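# For orientation: a sketch of the tab-separated layout that 'data/data_conll.txt'
# is assumed to follow in the test above (one token and BIO tag per line, blank
# line between sentences); the real file may carry extra columns.
example_conll_lines = [
    "22\tO",
    "-\tO",
    "oxacalcitriol\tO",
    "suppresses\tO",
    "secondary\tB-DISO",
    "hyperparathyroidism\tI-DISO",
    "",  # blank line separates sentences
]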
import numpy as np
import torch
from transformers import BertTokenizer, XLMTokenizer

from NLPDatasetIO.dataset import Dataset


class BratDataset(torch.utils.data.Dataset):

    def __init__(self, fold_path, fold_type, tokenizer, labeled=True,
                 label2int=None, kwargsDataset={'format': 'brat'},
                 to_sentences=False, random_state=None, shuffle=False,
                 datasets_iter=None, is_binary=False):
        '''
        fold_path: path to fold folder, must contain corresponding .txt and .ann files
        fold_type: 'train', 'dev' or 'test'
        tokenizer: tokenizer to use with the dataset
        kwargsDataset: dict with options for NLPDatasetIO.Dataset
        to_sentences: whether to split each document into sentences
        '''
        assert fold_type in ('train', 'dev', 'test')
        if fold_type != 'train' and labeled:
            assert label2int is not None
        self.fold_type = fold_type
        self.fold_path = fold_path
        if datasets_iter is None:
            self.documents = Dataset(location=fold_path, split=fold_type,
                                     **kwargsDataset).documents
        else:
            self.documents = []
            for dataset in datasets_iter:
                self.documents.extend(dataset.documents)
        if to_sentences and datasets_iter is None:
            sentences = []
            for doc in self.documents:
                sentences.extend(doc.sentences)
            self.documents = sentences
        self.tokenizer = tokenizer
        self.labeled = labeled
        self.is_binary = is_binary
        if self.labeled:
            if datasets_iter is None:
                self.labels = [doc.token_labels for doc in self.documents]
            else:
                self.labels = []
                for dataset in datasets_iter:
                    self.labels.extend(dataset.labels)
            if self.is_binary:
                # make it ADR vs Other
                for idx, doc_labels in enumerate(self.labels):
                    self.labels[idx] = list(
                        map(lambda label: label if 'ADR' in label else 'O',
                            doc_labels))
            self.set_label_info(label2int)
        self.random_state = random_state
        if random_state is not None:
            np.random.seed(random_state)
        self.shuffle = shuffle
        if shuffle:
            # shuffle documents and labels with the same RNG state so they stay aligned
            rng_state = np.random.get_state()
            np.random.shuffle(self.documents)
            if self.labeled:
                np.random.set_state(rng_state)
                np.random.shuffle(self.labels)

    def set_label_info(self, label2int):
        self.label_set = set(['O'])
        for token_labels in self.labels:
            self.label_set = self.label_set | set(token_labels)
        if label2int is None:
            # learn labels
            self.label2int = {'O': 0}
            for idx, label in enumerate(sorted(self.label_set - set(['O'])), 1):
                self.label2int[label] = idx
        else:
            # set labels from other fold
            self.label2int = label2int
        self.int2label = {val: key for key, val in self.label2int.items()}
        self.num_labels = len(self.int2label)

    def __len__(self):
        return len(self.documents)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        document = self.documents[idx]
        # encoded_text = self.tokenizer.encode_plus(document.text, max_length=512)
        # can't use that robustly because of how NLPDatasetIO works,
        # so the encoding is built manually
        preceding_token_id, trailing_token_id = None, None
        if isinstance(self.tokenizer, BertTokenizer):
            preceding_token_id, trailing_token_id = (self.tokenizer.cls_token_id,
                                                     self.tokenizer.sep_token_id)
        if isinstance(self.tokenizer, XLMTokenizer):
            preceding_token_id, trailing_token_id = (self.tokenizer.bos_token_id,
                                                     self.tokenizer.sep_token_id)
        text_tokens = [token.token for token in document._tokens][:510]
        encoded_text = {}
        encoded_text['input_ids'] = (
            [preceding_token_id]
            + self.tokenizer.convert_tokens_to_ids(text_tokens)
            + [trailing_token_id])
        encoded_text['token_type_ids'] = torch.zeros(
            len(encoded_text['input_ids'])).long()
        encoded_text['attention_mask'] = torch.ones(
            len(encoded_text['input_ids'])).long()
        item = {key: torch.tensor(val) for key, val in encoded_text.items()}
        if self.labeled:
            encoded_labels = list(
                map(lambda elem: self.label2int.get(elem, self.label2int['O']),
                    self.labels[idx][:len(encoded_text['input_ids']) - 2]))
            labels = [self.label2int['O']] + encoded_labels + [self.label2int['O']]
            item['labels'] = torch.tensor(labels)
        return item
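# A minimal usage sketch for the BratDataset class above. The fold path
# 'data/train', the multilingual BERT checkpoint and the pad_collate helper are
# illustrative assumptions, not part of the original code; padding with 0 is
# safe here because label2int maps 'O' to 0.
import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer


def pad_collate(batch):
    # hypothetical helper: pad every tensor field of the batch to a common length
    return {key: pad_sequence([item[key] for item in batch],
                              batch_first=True, padding_value=0)
            for key in batch[0]}


tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
train_set = BratDataset('data/train', 'train', tokenizer, to_sentences=True)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=8,
                                           shuffle=True, collate_fn=pad_collate)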
from argparse import ArgumentParser

from NLPDatasetIO.dataset import Dataset

if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--conll_data')
    parser.add_argument('--concept_ids')
    parser.add_argument('--save_to')
    args = parser.parse_args()

    dataset = Dataset(location=args.conll_data, format='conll', sep=' ')
    with open(args.concept_ids, encoding='utf-8') as input_stream:
        concept_ids = [line.split()[0] for line in input_stream]

    idx = 0
    for document in dataset.documents:
        for entity in document.entities:
            entity.label = concept_ids[idx]
            idx += 1

    dataset.save('json', path_to_save=args.save_to)
from NLPDatasetIO.dataset import Dataset


def test_load_dataset():
    dataset = Dataset('data/brat_format_data', 'brat')
    for document in dataset.documents:
        for entity in document.entities:
            print(entity.start, entity.end, entity.text, entity.type,
                  entity.label)
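# For reference: a minimal Brat standoff pair like those the test above reads
# from 'data/brat_format_data'. File names, the text and the ADR label are
# illustrative; the two numbers are the character start/end of the annotated span.
from pathlib import Path

Path('data/brat_format_data/example.txt').write_text(
    'The drug gave me a headache.', encoding='utf-8')
Path('data/brat_format_data/example.ann').write_text(
    'T1\tADR 19 27\theadache\n', encoding='utf-8')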
from argparse import ArgumentParser

from NLPDatasetIO.dataset import Dataset

if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--conll_data')
    parser.add_argument('--save_entities_to')
    args = parser.parse_args()

    dataset = Dataset(location=args.conll_data, format='conll', sep=' ')
    with open(args.save_entities_to, 'w', encoding='utf-8') as output_stream:
        for document in dataset.documents:
            for entity in document.entities:
                output_stream.write(f"{entity.text}\n")
import codecs

from NLPDatasetIO.dataset import Dataset


def main():
    predicted_test_set = Dataset('../results_/predicted_biobert.txt', 'conll')
    output_path = r'predicted_biobert_sentences'
    with codecs.open(output_path, 'w+', encoding='utf-8') as output_file:
        for document in predicted_test_set.documents:
            output_file.write(f"{document.text.strip()}\n")