def extract_semtype_phrase(semtype, values, output_list, ent3, text):
	"""Scan MetaMap-style output for phrases mapped to `semtype` with a score of
	at least 800; record each cleaned 'Phrase:' line and, for tokens matching
	`values`, store their dependency context from `text` in `ent3`."""
	original_phrase_list = []
	temp = []
	
	text = text.replace(',','.')
	split_text = text.split('.')
	
	bionlp = en_ner_bc5cdr_md.load()
	bionlp.add_pipe(bionlp.create_pipe('sentencizer'))
	
	for line in output_list:
		line=line.strip()
		if line:
			temp.append(line)
		if '['+semtype+']' in line and int("".join(filter(str.isdigit, line))) >= 800:
			for rline in temp[::-1]:
				if 'Phrase' in rline:
					rline = rline.strip(" ;").replace("Phrase:", "").replace("'","").replace("[","").replace("]","").strip()
					rline = re.sub('^%s' % ",", "", rline)
					rline = re.sub('%s$' % ",", "", rline)
					if values and any(detail in rline.lower() for detail in values):
						match ='.'.join([s for s in split_text if rline.lower() in s.lower()])
						if 'pupils' in match or 'eyes' in match:
							match = re.sub(r'(.*)((pupils|eyes)[\sa-z,]+)|(.*)',r'\2',match, flags=re.IGNORECASE)
						doc=bionlp(match)						
						for token in doc:
							if any(detail in token.text.lower() for detail in values):
								ent3[token.text.upper()] = get_dependency(match, token)
					
					if rline not in original_phrase_list:
						original_phrase_list.append(rline)
						break
	return original_phrase_list, ent3
Example #2
    def __init__(self):
        self.tagger = en_ner_bc5cdr_md.load()
        self.abbreviation_pipe = AbbreviationDetector(self.tagger)
        self.tagger.add_pipe(self.abbreviation_pipe)
        self.linker = UmlsEntityLinker(resolve_abbreviations=True,
                                       max_entities_per_mention=1)
        self.tagger.add_pipe(self.linker)
        print('NER Module Ready')
Example #3
    def process_item(self, item, spider):
        diseases_list = item['reports'][0]['diseases']
        nlp_bc = en_ner_bc5cdr_md.load()
        doc_bc = nlp_bc(item['main_text'])

        # combine text with its label
        label = {}
        for token in doc_bc.ents:
            label[token.text] = token.label_
        # combine text with its pos
        pos = {}
        for token in doc_bc:
            pos[token.text] = token.pos_
        # combine text with its lemma
        lemma = {}
        for token in doc_bc:
            lemma[token.text] = token.lemma_

        syndromes = []
        for k, v in label.items():
            if v == "DISEASE":
                li = k.split(" ")
                noun = 0
                adj = 0
                adp = 0
                if li[-1].lower() == "coronavirus":
                    continue
                for c in li:
                    if "CoV" in c:
                        break
                    if c.isupper():
                        break
                    if lemma[c] != c:
                        break
                    if "disease" in c:
                        break
                    if c in diseases_list:
                        break

                    if pos.get(c) == "ADJ":
                        adj += 1
                    elif pos.get(c) == "NOUN":
                        noun += 1
                    # "of" case
                    elif pos.get(c) == "ADP":
                        adp += 1
                if adj == 0 and noun >= 1:
                    syndromes.append(k)
                elif adj == 1 and (noun >= 1 and noun <= 2):
                    syndromes.append(k)
                elif adj == 0 and noun >= 1 and adp >= 1:
                    syndromes.append(k)
        gc.collect()
        item['reports'][0]['syndromes'] = syndromes
        return item
def scispacy_plus_tokenizer(sequence: str, scispacy_tok=None) -> Iterator[str]:
    """
    Runs the scispacy tokenizer and removes all tokens with
    just whitespace characters
    """
    if scispacy_tok is None:
        import en_ner_bc5cdr_md
        scispacy_tok = en_ner_bc5cdr_md.load().tokenizer

    scispacy_tokens = list(map(lambda x: str(x), scispacy_tok(sequence)))
    tokens = filter(lambda t: not (' ' in t or '\n' in t or '\t' in t),
                    scispacy_tokens)

    return tokens
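
# Minimal usage sketch (assumption: en_ner_bc5cdr_md is installed); with no
# tokenizer argument, the function loads the scispacy tokenizer itself.
print(list(scispacy_plus_tokenizer("Aspirin 81 mg was given for chest pain.")))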
def detect_drugs(text_file):
    #NLP model for drug recognition
    nlp = en_ner_bc5cdr_md.load()

    # the input text file
    with open(text_file, "r") as f:
        text = f.read()

    #use NLP model to parse the text
    parsed_text = nlp(text)

    #extract entities/ drug names
    entities = parsed_text.ents

    #print entities
    print("List of entities:", entities)

    return entities
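
# Minimal usage sketch (hedged): "sample_note.txt" is a hypothetical file name;
# point it at any plain-text clinical note.
if __name__ == "__main__":
    for ent in detect_drugs("sample_note.txt"):
        print(ent.text, ent.label_)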
def information_extractor(text, semantic_dict):
	
	entities, ent2 = getspacy_pattern_matched_entities(text.lower())
	# de-duplicate while keeping list form
	entities = [t for t in set(tuple(i) for i in entities)]
	ent2 = [t for t in set(tuple(i) for i in ent2)]
	
	entities = refine_values(entities, text)
	
	ent_list = list(entities)
	ent_list2 = list(ent2)
	
	keys = ["AGE", "GENDER", "PULSE", "BP", "RESP", "B.G.L", "SPO2", "GCS", "MENTAL ST", "PATIENT COND"\
			,"MEDICATION","ALLERGIES","PAST MEDICAL HISTORY","PAIN","TRAUMA","PUPILS","LUNG SOUNDS","VERBAL",\
			"AIRWAY", "INJURY","MECHANISM OF INJURY","COMPLAINT","TREATMENT","NOTES"]

	result = {}
	
	for (k, v) in ent_list:
		if k and v:
			result.setdefault(k, []).append(v)
	for key in keys:
		if key not in result.keys():
			result[key] = ['']
	result2 = {}
	for (k, v) in ent_list2:
		result2.setdefault(k, []).append(v)
	
	d2=dict(result2)
	dd = defaultdict(list)

	for d in (semantic_dict, d2): # you can list as many input dicts as you want here
		for key, value in d.items():
			dd[key]= list(set(dd[key] + value))
	bionlp = en_ner_bc5cdr_md.load()
	bionlp.add_pipe(bionlp.create_pipe('sentencizer'))
	return dict(result), dd, semantic_dict
import pandas as pd
import csv
import spacy
import en_ner_bc5cdr_md
from collections import Counter
from spacy import displacy
from spacy.matcher import Matcher
import scispacy

# Apply the scispacy model to recognize diseases and chemicals
# This approach follows the work of two Kaggle authors: https://www.kaggle.com/maria17/cord-19-explore-drugs-being-developed

nlp = en_ner_bc5cdr_md.load()


# define the model wrapper
def scispacy_model(text, nlp):
    # map each recognized entity to its label (DISEASE / CHEMICAL)
    doc = nlp(str(text).lower())
    entities = {ent.text: ent.label_ for ent in doc.ents}
    return entities  # or visualize with: displacy.render(doc, style="ent")


text = 'The 2019–20 coronavirus pandemic is an ongoing pandemic of coronavirus disease 2019 (COVID-19) caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)'

print(scispacy_model(text, nlp))
Example #8
    'DUR': 'Duration',
    'ROU': 'Route',
    'FOR': 'Form',
    'ADE': 'ADE',
    'DOS': 'Dosage',
    'REA': 'Reason',
    'FRE': 'Frequency'
}

# =====BiLSTM + CRF model for NER=========
bilstm_config = BiLSTMConfig()
bilstm_model = BiLSTMModel(bilstm_config)
bilstm_learn = BiLSTMLearner(bilstm_config, bilstm_model)
bilstm_learn.load("ner_15e_bilstm_crf_elmo")

scispacy_tok = en_ner_bc5cdr_md.load().tokenizer
scispacy_plus_tokenizer.__defaults__ = (scispacy_tok, )

# =====BioBERT Model for RE======
re_label_list = ["0", "1"]
re_task_name = "ehr-re"

biobert_re_config = AutoConfig.from_pretrained(
    os.path.join(BIOBERT_RE_MODEL_DIR, "config.json"),
    num_labels=len(re_label_list),
    finetuning_task=re_task_name)

biobert_re_model = AutoModelForSequenceClassification.from_pretrained(
    os.path.join(BIOBERT_RE_MODEL_DIR, "pytorch_model.bin"),
    config=biobert_re_config,
)
def main():
    args = parse_arguments()

    if args.target_dir[-1] != '/':
        args.target_dir += '/'

    if args.sep == "tab":
        args.sep = "\t"

    if not os.path.isdir(args.target_dir):
        os.mkdir(args.target_dir)

    if args.tokenizer == "default":
        tokenizer = default_tokenizer

    elif args.tokenizer == "scispacy":
        import en_ner_bc5cdr_md
        tokenizer = en_ner_bc5cdr_md.load().tokenizer

    elif args.tokenizer == 'scispacy_plus':
        import en_ner_bc5cdr_md
        scispacy_tok = en_ner_bc5cdr_md.load().tokenizer
        scispacy_plus_tokenizer.__defaults__ = (scispacy_tok, )

        tokenizer = scispacy_plus_tokenizer

    elif args.tokenizer == 'biobert-large':
        from transformers import AutoTokenizer
        biobert = AutoTokenizer.from_pretrained(
            "dmis-lab/biobert-large-cased-v1.1")

        args.max_seq_len -= biobert.num_special_tokens_to_add()
        tokenizer = biobert.tokenize

    elif args.tokenizer == 'biobert-base':
        from transformers import AutoTokenizer
        biobert = AutoTokenizer.from_pretrained(
            "dmis-lab/biobert-base-cased-v1.1")

        args.max_seq_len -= biobert.num_special_tokens_to_add()
        tokenizer = biobert.tokenize

    else:
        warnings.warn("Tokenizer named " + args.tokenizer + " not found. "
                      "Using default tokenizer instead. Acceptable values "
                      "include 'scispacy', 'scispacy_plus', 'biobert-base', "
                      "'biobert-large', and 'default'.")
        tokenizer = default_tokenizer

    print("\nReading data\n")
    train_dev, test = read_data(data_dir=args.input_dir,
                                train_ratio=1 - args.test_split,
                                tokenizer=tokenizer,
                                verbose=1)

    if args.ade_dir is not None:
        ade_train_dev, ade_test = read_ade_data(ade_data_dir=args.ade_dir,
                                                train_ratio=1 - args.test_split,
                                                verbose=1)

        ade_dev_split_idx = int((1 - args.dev_split) * len(ade_train_dev))
        ade_train = ade_train_dev[:ade_dev_split_idx]
        ade_devel = ade_train_dev[ade_dev_split_idx:]

    else:
        ade_train_dev = None
        ade_train = None
        ade_test = None
        ade_devel = None

    print('\n')

    # Data is already shuffled, just split for dev set
    dev_split_idx = int((1 - args.dev_split) * len(train_dev))
    train = train_dev[:dev_split_idx]
    devel = train_dev[dev_split_idx:]

    # Data for NER
    if args.task.lower() == 'ner':
        files = {
            'train': (train, ade_train),
            'train_dev': (train_dev, ade_train_dev),
            'devel': (devel, ade_devel),
            'test': (test, ade_test)
        }

        ner_generator(files, args)

    # Data for RE
    elif args.task.lower() == 're':
        # {dataset_name: (ehr_data, ade_data, is_test, is_label)}
        files = {
            'train': (train, ade_train, False, True),
            'dev': (devel, ade_devel, False, True),
            'test': (test, ade_test, True, False),
            'test_labels': (test, ade_test, True, True)
        }

        re_generator(files, args)
Example #10
import en_core_sci_lg

# NER specific models
import en_ner_craft_md
import en_ner_bc5cdr_md
import en_ner_jnlpba_md
import en_ner_bionlp13cg_md
import en_core_med7_lg
from negspacy.negation import Negex

from spacy import displacy

# %%
TEXT_TAG = "TEXT"
MODELS = {
    "nlp_bc": (en_ner_bc5cdr_md.load(), "CHEMICAL"),
    "nlp_bi": (en_core_med7_lg.load(), "DRUG"),
    "med7": (en_ner_bionlp13cg_md.load(), "SIMPLE_CHEMICAL"),
}


# %%
def show_medical_abbreviation(model, document):
    """
    This function detects and resolves medical abbreviations in word entities

    Parameters:
         model(module): A pretrained biomedical model from ScispaCy(https://allenai.github.io/scispacy/)
         document(str): Document to be processed

    Returns: List of unique abbreviations and their resolution
    """
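    # A minimal body sketch (assumption, not the original implementation): it
    # relies on scispacy's AbbreviationDetector already being in the model's
    # pipeline, as in Example #2 above, and returns unique
    # (abbreviation, long form) pairs exposed via doc._.abbreviations.
    doc = model(document)
    return list({(abrv.text, abrv._.long_form.text) for abrv in doc._.abbreviations})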
def refine_entity(ent1, ent2, ent3, text):
	"""Merge scispacy entities (ent2), keyword rules, and dependency phrases
	(ent3) into the report field dictionary ent1."""
	
	bionlp = en_ner_bc5cdr_md.load()
	bionlp.add_pipe(bionlp.create_pipe('sentencizer'))
	bionlp.add_pipe(bionlp.create_pipe('merge_entities'))
	bionlp.add_pipe(bionlp.create_pipe('merge_noun_chunks'))
	
	replaced_txt = text.replace(',','.')
	split_text = replaced_txt.split('.')
	if ent2['Injury or Poisoning']:
		for s in split_text:
			ent1['INJURY'] = ent1['INJURY'] + [x for x in ent2['Injury or Poisoning'] if x.lower() in s.lower()\
							and not any(word in s.lower() for word in ['vehicle','collision','fell','fall','drowning','mechanism','priority'])]
	if ent2['DRUG']:
		ent1['MEDICATION']=ent2['DRUG']	
	if ent2['Medical Device']:
		ent1['TREATMENT']=ent2['Medical Device']	
	dtemp={'COMPLAINT':['complain'], \
	'INJURY':['injury','injuries', 'congestion', 'bump','abrasion','laceration','contusion','broken',\
	'swelling','fracture','scratch','bruise','gash','trauma',' shot', 'wound','entrance'], \
	'MECHANISM OF INJURY':['fell','fall','gunshot','struck','fire','attack', 'collision','assault','stab','hit','suicide','drowning','crash','GSW'],\
	'LUNG SOUNDS':['lung','lungs'],\
	'VERBAL':['confused','groggy'],\
	'PATIENT COND':['stable','unstable','critical'],\
	'MENTAL ST':['awake','alert','disoriented','oriented', 'lethargic','conscious','unconscious','unresponsive','loc','mental','crying'],\
	'TRAUMA':['priority'],\
	'TREATMENT':['immobiliz','high-flow O-2','IV ',' boarded','bag mask', 'IVs ','non-re'],\
	'PUPILS':['PUPILS'],\
	'PAIN':['PAIN'],'PAST MEDICAL HISTORY':['HISTORY'],'ALLERGIES':['allergies','allergie'],'AIRWAY':['airway']}
	for key, value in dtemp.items():
		for v in value:
			if ent3[v.upper()]:
				if ent1[key] and len(ent1[key])>0:
					ent1[key] = [x for x in ent1[key] if x and x.strip()]
					if ent3[v.upper()] not in ent1[key]:
						ent1[key] = list(set(ent1[key] + [ent3[v.upper()]]))
				else:
					ent1[key] = list(set([ent3[v.upper()]]))
			else:
				if key in ['ALLERGIES', 'MECHANISM OF INJURY', 'COMPLAINT', 'MENTAL ST', 'PATIENT COND', 'TRAUMA', 'TREATMENT']:
					if v.lower() == 'stab':
						v = v + r'\b'  # regex word boundary so 'stab' does not match 'stable' or 'stabbing'
					ent1[key] = ent1[key] + [f.strip() for f in ent2['Finding'] if re.compile(r'\b'+ v.lower()).search(f.lower()) and '?' not in f]
					ent1[key] = ent1[key] + [s.strip() for s in split_text if re.compile(r'\b'+ v.lower()).search(s.lower()) and '?' not in s]
					ent1[key] = list(set([re.sub(r'(.*):(.*)',r'\2',x) for x in ent1[key] if x and x.strip()]))
				elif key in ['INJURY','PUPILS']:
					no_word_list = ['vehicle','collision','fell','fall','drowning','mechanism','priority','fire','2']
					ent1[key] = ent1[key] + [s.strip() for s in split_text if re.compile(r'\b'+ v.lower()).search(s.lower()) and '?' not in s ]
					matching = '. '.join([s for s in split_text if v.lower() in s.lower() and '?' not in s])
					d2 = bionlp(matching)
					for token in d2:
						if v.lower() in token.text.lower():
							et = get_dependency(matching, token)
							if et not in ent1[key]:
								ent1[key] = list(set(ent1[key] + [get_dependency(matching, token)]))
					ent1[key] = list(set([x for x in ent1[key] if x and x.strip()
								and not any(word in x.lower() for word in no_word_list)]))
				else:
					matching = '. '.join([s for s in split_text if v.lower() in s.lower() and '?' not in s])
					d2 = bionlp(matching)
					for token in d2:
						if v.lower() in token.text.lower():
							if key =='PATIENT COND':
								pc=['patient','everything','he','she'] 
								if token.head.text.lower() in pc or any(item.text.lower() in pc for item in token.children) or any(item.text.lower() in pc for item in token.head.children):
									ent1[key] = list(set(ent1[key] + [get_dependency(matching, token)]))
									ent1[key] = [x for x in ent1[key] if x and x.strip()]
							else:
								ent1[key] = list(set(ent1[key] + [get_dependency(matching, token)]))
								ent1[key] = [x for x in ent1[key] if x and x.strip()]
	
	complaints = ent1['COMPLAINT']+ent1['PAIN']
	ent1['COMPLAINT'] = [x for x in complaints if x]
	for key in ent1:
		temp=ent1[key]
		s=[]
		for e in ent1[key]:
			s = s + [e.strip() for t in temp if (e != t and all(item in t.replace(',','').split(' ') for item in e.replace(',','').split(' '))) or '?' in e]
		ent1[key] = list(set(ent1[key]) - set(s))
	ent1['NOTES']=[text]	
	return ent1