Exemplo n.º 1
0
from argparse import Namespace
import tensorflow as tf
import tensorflow_estimator as tfe
from hedgedog.tf.io.dataset import Dataset
from hedgedog.tf.models.multitask_bert_model import MultitaskBertModel
from hedgedog.tf.typing import TensorOrTensorDict
from hedgedog.logging import get_logger

from el.data.dataset import NerDataset
from el.config import model_ing
from el.model.boundary import BoundaryModule
from el.model.normalization import NormalizationModule
from el.model.type import TypingModule, TypeEmbeddingModule
from el.model.cake import CakeModule

log = get_logger("el.model")


class MTBertModel(MultitaskBertModel):
  @model_ing.capture
  def __init__(self, mode: str, hyperparameters: Namespace, bert_model: str, dataset: Dataset = None):
    is_training = mode == "TRAIN"
    prediction_modules = []
    self.module_names = hyperparameters.model.modules
    if 'boundary' in hyperparameters.model.modules:
      prediction_modules.append(BoundaryModule(hyperparameters, is_training))
    if 'norm' in hyperparameters.model.modules:
      prediction_modules.append(NormalizationModule(hyperparameters, is_training))
    if 'cake' in hyperparameters.model.modules:
      cake_model_cons = {
        'basic': CakeModule
Exemplo n.º 2
0
Arquivo: __init__.py Projeto: r-mal/el
from collections import defaultdict
from pathlib import Path
import tensorflow_estimator as tfe
from hedgedog.logging import get_logger
from hedgedog.tf.estimator import train as hdtrain

from el.eval.evaluation import Evaluation
from el.eval.example import Example
from el.eval.span import Span

log = get_logger("el.eval")


def f1_eval(name, gold_spans, predicted_spans, sentence_texts, params):
    exact_eval = Evaluation(set(gold_spans), set(predicted_spans))
    report = f"\nExact Boundary Evaluation\n----------" +\
             f"\nP:  {exact_eval.precision()}" +\
             f"\nR:  {exact_eval.recall()}" +\
             f"\nF1: {exact_eval.f1()}"
    log.info(report)

    partial_eval = Evaluation(set(gold_spans),
                              set(predicted_spans),
                              partial=True)
    partial_report = f"\n----------\nPartial Boundary Evaluation\n----------" +\
                     f"\nP:  {partial_eval.precision()}" +\
                     f"\nR:  {partial_eval.recall()}" +\
                     f"\nF1: {partial_eval.f1()}"
    log.info(partial_report)
    report += partial_report
Exemplo n.º 3
0
Arquivo: cli.py Projeto: r-mal/el
from sacred import Experiment
from hedgedog.tf.estimator.ingredients import *
from hedgedog.tf.io.dataset_util import inspect_tfrecords, iterate_records
from hedgedog.tf.estimator import train as hdtrain
from hedgedog.tf.sacred import convert_to_namespace

from el.data.dataset import NerDataset
# noinspection PyUnresolvedReferences
from el import config as conf
from el.model.model import MTBertModel
from el.data import clef, medmentions
from el.eval.boundary import boundary_eval
from el.eval.entity import end_to_end_eval

# Reset the logging handlers before creating the package-level logger.
# NOTE(review): `logging` is not imported by name in this snippet; presumably it is
# provided by the star import from hedgedog.tf.estimator.ingredients -- confirm.
logging.reset_handlers()
log = logging.get_logger('el')

# Sacred experiment assembled from the sampling, dataset, estimator, model and
# training ingredients; the @ex.command functions in this module are registered on it.
ex = Experiment(ingredients=[
    sampling_ingredient, dataset_ingredient, estimator_ingredient,
    conf.model_ing, training_ingredient
])


@ex.command
def train(_run):
    """Experiment command that trains an ``MTBertModel``.

    The run's configuration is converted to a namespace and handed, together
    with the run object itself, to ``hdtrain.train``.

    :param _run: the current run object; its ``config`` attribute is read here
    """
    hdtrain.train(
        _run,
        model_class=MTBertModel,
        parameters=convert_to_namespace(_run.config),
    )


@ex.command
def evaluate(_run):
Exemplo n.º 4
0
Arquivo: cake.py Projeto: r-mal/el
from abc import ABC

import tensorflow as tf
import numpy as np
import os
from hedgedog.tf.typing import TensorDict, TensorOrTensorDict
from hedgedog.tf import layers as hdlayers
from hedgedog.tf import metrics as hdmetrics
from hedgedog.logging import get_logger
import hedgedog.tf.models.bert as modeling

from el.model.normalization import NormalizationModule
from el.config import model_ing

log = get_logger("el.model.cake")


class CakeModule(NormalizationModule):
    @model_ing.capture
    def __init__(self,
                 params,
                 is_training,
                 ace_path: str,
                 train_bert: bool,
                 include_cls_sep: bool = True):
        super().__init__(params, is_training)
        # half, just embs, no proj_embs
        self.embedding_size = 50
        self.rnn_num_layers = 1
        self.rnn_hidden_size = 512
        self.train_bert = train_bert
Exemplo n.º 5
0
from typing import List
from pathlib import Path
import traceback
import json
from tqdm import tqdm
from hedgedog.logging import get_logger
from hedgedog.nlp.spacy.umls import UmlsCandidateGenerator
from hedgedog.tf.estimator.ingredients import dataset_ingredient
from el.data.text import Concept, Document, Span

log = get_logger("mm.data.medmentions")


class MedMentionsDocument(Document):
  def __init__(self, lines: List[str], umls, k, mention2idx):
    """
    Builds a document from the lines of one MedMentions record.

    :param lines: the record's lines: a pipe-delimited title line, a pipe-delimited text
      line, then one tab-separated annotation line per mention
      (doc id, start, end, mention text, comma-separated types, cui)
    :param umls: passed through unchanged to ``Document.__init__``
    :param k: passed through unchanged to ``Document.__init__``
    :param mention2idx: passed through unchanged to ``Document.__init__``
    :raises ValueError: if the title line has fewer than three pipe-separated fields, an
      annotation line does not have exactly six tab-separated fields, or an offset is
      not an integer
    """
    # Cap the split at two so a '|' inside the title stays part of the title instead of
    # breaking the three-way unpacking (the text line below already tolerates extra pipes).
    did, _, title = lines[0].strip().split('|', 2)
    text = '|'.join(lines[1].strip().split('|')[2:])
    doc_string = title + '. ' + text
    entities = []
    for line in lines[2:]:
      # NOTE: `did` is rebound by every annotation row, so the id handed to the parent
      # constructor is the one from the last row whenever the record has annotations.
      did, start, end, mention, types, cui = line.strip().split('\t')
      cui = cui.replace('UMLS:', '')
      start, end = int(start), int(end)
      # Title and text are joined with the two characters '. '; offsets located past the
      # title are moved right by one. Presumably the source offsets assume a single
      # separator character between title and text -- TODO confirm against the data.
      if start > len(title):
        start += 1
        end += 1
      entities.append(Concept([Span(start, end, mention)], types.split(','), cui))

    super().__init__(did, doc_string, entities, umls, k, mention2idx)

Exemplo n.º 6
0
Arquivo: boundary.py Projeto: r-mal/el
import tensorflow as tf
from hedgedog.tf.estimator.multitask import Module
from hedgedog.tf.typing import TensorDict, TensorOrTensorDict
from hedgedog.nlp.seq_tag import get_tags
from tensorflow.contrib import crf
from hedgedog.tf import layers as hdlayers
from hedgedog.tf import metrics as hdmetrics
from hedgedog.logging import get_logger

from el.config import model_ing

log = get_logger("el.model.boundary")


class BoundaryModule(Module):
    @model_ing.capture
    def __init__(self, params, is_training, verbose_eval, use_bilstm,
                 thiccness):
        """Set up the boundary tagging module.

        Looks up the tag-to-id mapping for ``params.dataset.tagset``, creates a
        square float32 variable named ``crf_params`` with one row and column
        per tag, and stores the remaining options on the instance.

        :param params: hyperparameter namespace; ``params.dataset.tagset`` is read here
        :param is_training: forwarded to the parent ``Module`` constructor
        :param verbose_eval: stored as ``self.verbose_eval``
        :param use_bilstm: stored as ``self.use_bilstm``
        :param thiccness: stored as ``self.thiccness``

        NOTE(review): the last three arguments are presumably injected from the
        ``model_ing`` configuration by ``@model_ing.capture`` -- confirm.
        """
        super().__init__(params, is_training)

        # Plain option flags first; they do not depend on the tag inventory.
        self.verbose_eval = verbose_eval
        self.use_bilstm = use_bilstm
        self.thiccness = thiccness

        self.boundary2id = get_tags(params.dataset.tagset).tag2id()
        num_tags = len(self.boundary2id)
        # One entry per ordered pair of tags (presumably CRF transition scores).
        self.crf_params = tf.get_variable(
            "crf_params",
            shape=[num_tags, num_tags],
            dtype=tf.float32,
        )

        message = f"Initialized Boundary Module with tagset: {params.dataset.tagset} {self.boundary2id}"
        log.info(message)
Exemplo n.º 7
0
import json
from collections import defaultdict
from pathlib import Path
from typing import List, Dict, Callable, Generator
import numpy as np
import tensorflow as tf
from tqdm import tqdm

from hedgedog.logging import get_logger
from hedgedog.nlp.spacy.umls import UmlsCandidateGenerator
from el.data.dataset import overlap
from el.eval import init_model, f1_eval
from el.eval.span import Span, wpid_sequence_to_string

log = get_logger("el.eval.entity")


def end_to_end_eval(model_class, params):
  gold_spans = _load_gold_spans(params)
  sentence_dicts = _load_sentences(params)
  # num_steps = params.dataset.num_test if params.estimator.eval_test_set else params.dataset.num_dev
  # sentence_dicts: Dict[str, Dict[str, None]] = {s['sentence_id']: s
  #                                            for s in tqdm(predict_boundaries(model_class, params), total=num_steps)}

  log.info("Boundary detection done. Running entity linking...")
  predicted_spans = []
  sentence_texts = {}
  id2wp = {v: k for k, v in model_class.dataset().wptokenizer.vocab.items()}
  for sentence in predict_entities(model_class, params, sentence_dicts):
    # noinspection PyTypeChecker
    predicted_spans.extend(sentence['spans'])
Exemplo n.º 8
0
Arquivo: text.py Projeto: r-mal/el
from pathlib import Path
from typing import List
from string import punctuation
from hedgedog.logging import get_logger
from hedgedog.nlp.spacy.umls import UmlsCandidate, UmlsCandidateGenerator

log = get_logger("mm.data.brat")

# Remove the apostrophe from the punctuation characters. This rebinds the name imported
# from `string`, so later uses in this module see the apostrophe-free version.
punctuation = punctuation.replace('\'', '')


class LazySpacy:
    """Wrapper that postpones building the spaCy annotator until first use."""

    def __init__(self):
        # Stays None until annotate() is called for the first time.
        self.spacy = None

    def annotate(self, text):
        """Annotate ``text``, constructing the annotator on the first call.

        :param text: value handed to the annotator's ``annotate`` method
        :return: whatever the underlying annotator's ``annotate`` returns
        """
        annotator = self.spacy
        if annotator is None:
            # Imported here rather than at module level so that merely importing
            # this module does not pull in the annotator.
            from hedgedog.nlp.spacy import SpacyAnnotator
            annotator = SpacyAnnotator('en_core_sci_sm', 'default', ['parser'])
            self.spacy = annotator
        return annotator.annotate(text)


# Module-level shared instance; the underlying annotator is only constructed on the
# first call to spacy.annotate().
spacy = LazySpacy()


class Span:
    def __init__(self, start: int, end: int, text: str):
        self.start = start
        self.end = end
        self.text = text
Exemplo n.º 9
0
from pathlib import Path
from typing import Generator
import json
import numpy as np
from collections import defaultdict
from hedgedog.tf.estimator.ingredients import dataset_ingredient
from hedgedog.tf.io.dataset import FeatureDataset, T
from hedgedog.tf.io.Feature import *
from hedgedog.nlp.wordpiece_tokenization import load_wordpiece_tokenizer
from hedgedog.logging import get_logger
from hedgedog.nlp.spacy.umls import UmlsCandidateGenerator
from hedgedog.nlp.seq_tag import get_tags, ContinuationBoundaryLabeler, IOBESContinuationBoundaryAggregator

log = get_logger("mm.data.dataset")


class NerDataset(FeatureDataset):
    @dataset_ingredient.capture
    def __init__(self, data_dir: str, batch_size: int, bert_model: str,
                 project_dir: str, candidates_per_concept: int,
                 record_dir_name: str, tagset,
                 ignore_sentences_without_concepts, dataset,
                 mention_candidate_path):
        super().__init__(data_dir, batch_size)
        info_dir = Path(project_dir) / 'info'
        self.wptokenizer = load_wordpiece_tokenizer(bert_model)
        self.cui2id = json.load((info_dir / 'cui2id.json').open())
        self.tui2label_id = json.load((info_dir / 'tui2label.json').open())
        self.candidates_per_concept = candidates_per_concept
        self.filter = ignore_sentences_without_concepts
        self.mention2idx = json.load(
Exemplo n.º 10
0
from pathlib import Path
from hedgedog.logging import get_logger
from hedgedog.nlp.spacy.umls import UmlsCandidateGenerator
from hedgedog.tf.estimator.ingredients import dataset_ingredient
from el.data.text import Concept, Document, Span
import json
from tqdm import tqdm
import traceback

log = get_logger("mm.data.clef")


class ClefDocument(Document):
    def __init__(self, data_dir: Path, doc_id: str, umls, k, code2id, id2code,
                 id2types, mention2idx):
        text = (data_dir / f"{doc_id}.text").read_text().replace('\t', ' ')
        concepts = []
        with (data_dir / f"{doc_id}.pipe").open('r') as f:
            for line in f:
                line = line.strip()
                try:
                    fields = line.split('||')
                    _, cui = fields[1], fields[2]
                    spans = []
                    for i in range(3, len(fields), 2):
                        start, end = int(fields[i]), int(fields[i + 1])
                        spans.append(Span(start, end, text[start:end]))
                except ValueError as e:
                    log.error(f"Could not parse line: ##{line}##")
                    log.error(e)
                    traceback.print_exc()
Exemplo n.º 11
0
Arquivo: boundary.py Projeto: r-mal/el
from typing import List, Dict, Generator
from tqdm import tqdm
import json
from pathlib import Path
from hedgedog.logging import get_logger
from hedgedog.nlp.seq_tag import IOBESContinuationBoundaryAggregator

from el.eval import init_model, f1_eval
from el.eval.span import Span, wpid_sequence_to_string

log = get_logger("el.eval.boundary")


def boundary_eval(model_class, params):
  """
  Conducts f1/p/r eval given a list of prediction and label dicts
  """
  log.info("Beginning evaluation...")
  ds = model_class.dataset()

  id2wp = {v: k for k, v in ds.wptokenizer.vocab.items()}
  gold_spans = []
  predicted_spans = []
  sentence_texts = {}
  num_steps = params.dataset.num_test if params.estimator.eval_test_set else params.dataset.num_dev
  sentence_dicts = {}
  text2id = {}

  for sentence in tqdm(predict_boundaries(model_class, params), total=num_steps):
    sid = sentence['sentence_id']
    sentence_texts[sid] = wpid_sequence_to_string(sentence['tokens'], id2wp)