def analyzeReport(self, report):
    """Create a pyConTextGraph markup for a single radiology report.

    report: a text string containing the radiology report.

    Returns the ConTextDocument holding one ConTextMarkup per sentence,
    with the document-level graph computed (verbose mode on).
    """
    context = pyConText.ConTextDocument()
    targets = self.targets
    modifiers = self.modifiers
    splitter = helpers.sentenceSplitter()
    sentences = splitter.splitSentences(report)
    # NOTE: removed an unused local counter (`count = 0`) that was never
    # read or incremented in the original implementation.
    for s in sentences:
        markup = pyConText.ConTextMarkup()
        markup.setRawText(s)
        markup.cleanText()
        markup.markItems(modifiers, mode="modifier")
        markup.markItems(targets, mode="target")
        # Prune overlapping marks, drop explicit exclusions, then link
        # each modifier to the targets that fall within its scope.
        markup.pruneMarks()
        markup.dropMarks('Exclusion')
        markup.applyModifiers()
        context.addMarkup(markup)
    context.computeDocumentGraph(verbose=True)
    return context
def analyzeSentence(sentence, targets=targets, modifiers=modifiers):
    """Mark up a single sentence and return the negation value of its
    first marked target.

    NOTE(review): the defaults `targets=targets` / `modifiers=modifiers`
    bind module-level globals at definition time — presumably loaded
    earlier in this module; verify against the enclosing file.
    Returns the result of getNegationValue for the FIRST marked target
    only (the `return` sits inside the loop), or None if no targets
    were marked.
    """
    # Domain-specific preprocessing; "suicide" appears to select a
    # preprocessing profile — TODO confirm against sentenceNLP.
    sentence = sentenceNLP.preprocess(sentence, "suicide")
    print sentence
    # `counter` is dead code: set, incremented once, never read.
    counter = 0
    counter += 1
    # print "sentence no: "+str(counter)+" - "+sentence
    context = pyConText.ConTextDocument()
    markup = pyConText.ConTextMarkup()
    markup.setRawText(sentence)
    markup.markItems(modifiers, mode="modifier")
    markup.markItems(targets, mode="target")
    markup.pruneMarks()
    markup.dropMarks('Exclusion')
    markup.applyModifiers()
    markup.dropInactiveModifiers()
    context.addMarkup(markup)
    g = context.getDocumentGraph()
    ma = g.getMarkedTargets()
    print g
    # if len(ma)==0:
    # print sentence
    for te in ma:
        print te
        # Returns on the first iteration: only the first marked target
        # is ever evaluated.
        return getNegationValue(g, te)
    return None
def create_markup(s, modifiers=None, targets=None, span=None, prune_inactive=True):
    """Creates a markup object from a sentence.
    s is a sentence from a list of a split report.
    span is the tuple of the span of the sentence. Optional.
    Returns a named tuple where markup=markup, span=span
    """
    result = pyConText.ConTextMarkup()
    result.setRawText(s)
    # Fall back to a default docSpan when the caller is just splitting a list.
    result.docSpan = span or (0, len(s))
    result.cleanText()
    # Mark modifiers first, then targets, mirroring the standard pipeline.
    result.markItems(modifiers, mode="modifier")
    result.markItems(targets, mode="target")
    result.pruneMarks()
    result.dropMarks('Exclusion')
    result.applyModifiers()
    result.pruneSelfModifyingRelationships()
    if prune_inactive:
        result.dropInactiveModifiers()
    return result
#def evaluate_report(markups, annotations): """
def _annotateSentences(sentenceList, targets, modifiers, modifierToClassMap, annotationGroup):
    """
    Takes a list of sentence objects that all belong to the same document and returns a list of tuples, all of the
    form (<PyConText Node>, <Sentence Object>, <MentionLevelAnnotation>). If isinstance(sentenceList,
    DocumentPlaceholder) this function returns an empty list. Similarly, if no annotations are produced by
    processing the sentences, this function returns an empty list.

    NOTE(review): `annotationGroup` is accepted but not referenced in this
    body — presumably consumed by a caller or a later revision; confirm.
    """
    annotationTrioTuples = []
    if isinstance(sentenceList, DocumentPlaceholder):
        return annotationTrioTuples
    for sentence in sentenceList:
        # Standard pyConText sentence pipeline: mark, prune, apply scopes.
        markup = pyConText.ConTextMarkup()
        markup.setRawText(sentence.text)
        markup.cleanText()
        markup.markItems(modifiers, mode="modifier")
        markup.markItems(targets, mode="target")
        markup.pruneMarks()
        markup.applyModifiers()
        markup.pruneSelfModifyingRelationships()
        markup.dropInactiveModifiers()
        # Collect all the nodes before making MentionLevelAnnotation objects to check for duplicate nodes. This is
        # necessary in case the TargetSpanSplitter was used.
        for node in markup.nodes():
            if node.getCategory()[0] == 'target':
                # Give annotationId placeholder with specific number to be assigned at the end of the function when we
                # know how many unique annotations we have.
                annotationId = "pyConTextNLP_Instance_"
                attributes = {
                    "certainty": "definite"
                }
                annotationClass = None
                if markup.isModifiedByCategory(node, "NEGATED_EXISTENCE") and markup.isModifiedByCategory(node, "AFFIRMED_EXISTENCE"):
                    # Currently any node that is marked as both affirmed and negated is considered negated.
                    print(
                        "Node is modified by both NEGATED_EXISTENCE and AFFIRMED_EXISTENCE....hmmmm.\n\nNote: %s\nSentence: %s" % (
                            sentence.documentName, sentence.text))
                    annotationClass = modifierToClassMap["NEGATED_EXISTENCE"]
                elif markup.isModifiedByCategory(node, "NEGATED_EXISTENCE"):
                    annotationClass = modifierToClassMap["NEGATED_EXISTENCE"]
                # If the node is not modified by NEGATED_EXISTENCE assume it is modified by AFFIRMED_EXISTENCE or it
                # is a target with no modifier and consider it a bleeding_present annotation.
                else:
                    annotationClass = modifierToClassMap["AFFIRMED_EXISTENCE"]
                # Record the phrases of all modifiers attached to this target.
                predecessorList = markup.predecessors(node)
                predecessorPhrases = []
                for predecessor in predecessorList:
                    predecessorPhrases.append(predecessor.getPhrase())
                targetDict = {"modifiers": predecessorPhrases, "target": node.getPhrase()}
                newAnnotation = MentionLevelAnnotation(sentence.text, sentence.documentSpan[0], sentence.documentSpan[1],
                                                       "pyConTextNLP", annotationId, attributes, annotationClass,
                                                       dynamicProperties=targetDict)
                annotationTrioTuples.append(AnnotationTrio(node, sentence, newAnnotation))
    return annotationTrioTuples
def markup_sentence_extract(self, sentence, targets):
    """Build a target-only markup for one sentence (no modifiers applied)."""
    extract = pyConTextGraph.ConTextMarkup()
    lowered = sentence.lower()
    extract.setRawText(lowered)
    # Seed the graph attributes pyConText expects for scoping.
    extract.graph["__txt"] = lowered
    extract.graph["__scope"] = (0, len(lowered))
    extract.markItems(targets, mode="target")
    extract.pruneMarks()
    extract.dropMarks('Exclusion')
    extract.pruneSelfModifyingRelationships()
    return extract
def analyzeReport(self,csv,eHOST, idName,report, modFilters = ['indication','pseudoneg','probable_negated_existence', 'definite_negated_existence', 'probable_existence', 'definite_existence','future', 'historical', 'cigarette_units', 'frequency', 'amount', 'current', 'past', 'cessation', "initiation","pack_year", ] ): """given an individual radiology report, creates a pyConTextSql object that contains the context markup report: a text string containing the radiology reports mode: which of the pyConText objects are we using: disease modFilters: """ self.context = pyConText.ConTextDocument() targets=self.targets modifiers = self.modifiers if modFilters == None : modFilters = ['indication','pseudoneg','probable_negated_existence', 'definite_negated_existence', 'probable_existence', 'definite_existence', 'future', 'historical', 'cigarette_units', 'frequency', 'amount', 'current', 'past', 'cessation', "initiation","pack_year", ] fo=open(os.getcwd()+"/eHOST_FILES/corpus/%s"%idName, "w") fo.write(report.strip()) fo.close() splitter = helpers.sentenceSplitter() sentences = splitter.splitSentences(report) count = 0 for s in sentences: markup=pyConText.ConTextMarkup() markup.setRawText(s) markup.cleanText() markup.markItems(modifiers, mode="modifier") markup.markItems(targets, mode="target") markup.pruneMarks() markup.applyModifiers() markup.dropInactiveModifiers() count += 1 self.context.addMarkup(markup) idName, sevFlag, htmlStr = html.mark_document_with_html(csv, eHOST, idName,self.context) self.outString+= self.context.getXML()+u"\n" print self.context.getXML()#;raw_input() return idName, sevFlag, htmlStr
def analyzeSentence(sentence, targets=targets, modifiers=modifiers, tagExperiencer=False):
    """Mark up one sentence and collect negation (and optionally
    experiencer) values for every marked target.

    NOTE(review): the defaults `targets=targets` / `modifiers=modifiers`
    bind module-level globals at definition time — verify against the
    enclosing file.
    Returns (tst, details): parallel lists of negation values and their
    descriptions, one pair per accepted target.
    """
    #sentence = sentenceNLP.preprocess(sentence, "suicide")
    # `counter` is dead code: set, incremented once, never read.
    counter = 0
    counter += 1
    # print "sentence no: "+str(counter)+" - "+sentence
    context = pyConText.ConTextDocument()
    markup = pyConText.ConTextMarkup()
    markup.setRawText(sentence)
    markup.markItems(modifiers, mode="modifier")
    markup.markItems(targets, mode="target")
    print markup.getConTextModeNodes('modifier')
    markup.pruneMarks()
    markup.dropMarks('Exclusion')
    markup.applyModifiers()
    markup.dropInactiveModifiers()
    markup.updateScopes()
    context.addMarkup(markup)
    g = context.getDocumentGraph()
    #print "graph: ",g
    ma = g.getMarkedTargets()
    # if len(ma)==0:
    # print sentence
    tst = []
    details = []
    # found maps each detail string to a Counter over its negation value;
    # built but not returned — presumably for debugging. TODO confirm.
    found = {}
    for te in ma:
        #print ma
        tmp1, tmp2 = getNegationValue(g, te)
        if tagExperiencer:
            # Only keep targets whose experiencer is not 'Other'
            # (i.e. skip mentions about someone other than the patient).
            e1, e2 = getExperiencerValue(g, te)
            if e1 != 'Other':
                #print e1
                #print sentence
                tst.append(tmp1)
                details.append(tmp2)
                found[tmp2] = Counter(tmp1)
        else:
            tst.append(tmp1)
            details.append(tmp2)
            found[tmp2] = Counter(tmp1)
        #print tmp1, tmp2
        #print e1, e2
    #print tst, details
    return tst, details
def markup_sentence(s, modifiers, targets, prune_inactive=True):
    """Run the full pyConText pipeline over one sentence and return its markup."""
    result = pyConTextGraph.ConTextMarkup()
    result.setRawText(s)
    result.cleanText()
    # Mark modifiers before targets, as the pipeline expects.
    for items, mode in ((modifiers, "modifier"), (targets, "target")):
        result.markItems(items, mode=mode)
    result.pruneMarks()
    result.dropMarks('Exclusion')
    # apply modifiers to any targets within the modifiers scope
    result.applyModifiers()
    result.pruneSelfModifyingRelationships()
    if prune_inactive:
        result.dropInactiveModifiers()
    return result
def test_itemData(self):
    """Smoke-test each item's regex: it should match the item's literal
    both before and after pyConText text cleaning.

    NOTE(review): this prints results rather than asserting — it is a
    diagnostic, not a true unit test.
    """
    for m in self.items:
        reg = m.getRE()
        if (reg):
            context = pyConText.ConTextMarkup()
            # Rebind reg to the literal; the compiled pattern below is
            # built from the literal, not from m.getRE().
            reg = m.getLiteral()
            r = re.compile(reg, re.IGNORECASE | re.UNICODE)
            print "l: %s; r: %s" % (m.getLiteral(), m.getRE())
            print bool(r.findall(m.getLiteral())), r.findall(
                m.getLiteral())
            print "Now clean text"
            # Verify the pattern still matches after cleanText()
            # normalizes the raw text.
            context.setRawText(reg)
            context.cleanText()
            print bool(r.findall(context.getText())), r.findall(
                context.getText())
            print "_" * 42
def markup_sentence(targets, modifiers, sentence):
    """Markup sentence with lexical targets and lexical modifiers.

    Args:
        targets (pyConTextNLP.itemData.itemData): itemData stores a literal, category, regular expression, and rule
            of the targets extracted from the targets_file input.
        modifiers (pyConTextNLP.itemData.itemData): itemData stores a literal, category, regular expression, and rule
            of the modifiers extracted from the modifiers_file input.
        sentence (str): a string representing one sentence of a report.

    Returns:
        markup (pyConTextNLP.pyConTextGraph.ConTextMarkup): digraph of the
            relationships between lexical targets and lexical modifiers
            found in the sentence.
    """
    sent_markup = pyConText.ConTextMarkup()
    # Load the raw sentence, then strip non-alphanumerics / normalize whitespace.
    sent_markup.setRawText(sentence)
    sent_markup.cleanText()
    # Tag modifier phrases first, then target phrases.
    sent_markup.markItems(modifiers, mode="modifier")
    sent_markup.markItems(targets, mode="target")
    # Span pruning: any mark wholly contained in another mark's span is
    # discarded (modifiers and targets handled separately).
    sent_markup.pruneMarks()
    # Attach each in-scope modifier to its target(s).
    sent_markup.applyModifiers()
    # Discard modifiers that ended up attached to nothing.
    sent_markup.dropInactiveModifiers()
    return sent_markup
def setUp(self):
    """Build the fixtures shared by the sentence-splitting tests:
    a markup object, a splitter, four sample utterances, and itemData.
    """
    # Fixture markup and sentence splitter under test.
    self.context = pyConText.ConTextMarkup()
    self.splitter = helpers.sentenceSplitter()
    # su1: Swedish text with embedded XML-ish tags and escaped characters.
    self.su1 = u'kanso <Diagnosis>**diabetes**</Diagnosis> utesl\xf6t eller diabetes men inte s\xe4kert. Vi siktar p\xe5 en r\xf6ntgenkontroll. kan det vara nej panik\xe5ngesten\n?'
    self.su2 = u'IMPRESSION: 1. LIMITED STUDY DEMONSTRATING NO GROSS EVIDENCE OF SIGNIFICANT PULMONARY EMBOLISM.'
    # su3/su4 exercise sentence-boundary detection around numerals.
    self.su3 = u'This is a sentence that does not end with a number. But this sentence ends with 1. So this should be recognized as a third sentence.'
    self.su4 = u'This is a sentence with a numeric value equal to 1.43 and should not be split into two parts.'
    # Each item is [literal, category, regex, rule] in itemData order.
    self.items = [[
        u"pulmonary embolism", u"PULMONARY_EMBOLISM",
        ur"""pulmonary\s(artery )?(embol[a-z]+)""", ""
    ], [
        "no gross evidence of", "PROBABLE_NEGATED_EXISTENCE", "",
        "forward"
    ]]
    self.itemData = itemData.itemData()
    for i in self.items:
        # NOTE(review): this binds the contextItem class itself without
        # calling it or using `i` — looks truncated/incomplete; confirm
        # against the original test module.
        cit = itemData.contextItem
def setUp(self):
    """Create sentence markups, a context document, and classifiers
    for a three-sentence sample report."""
    self.txt = 'There is fluid collection in the abdomen. There is no hematoma near the liver. Evaluate for abscess.'
    pairs = helpers.my_sentence_splitter(self.txt)
    self.sentenceSpanPairs = pairs
    self.sentences = [pair.text for pair in pairs]
    self.spans = [pair.span for pair in pairs]
    self.modifiers = fc.modifiers
    self.targets = fc.targets
    # One markup per sentence, then fold them into a single document.
    self.markups = [fc.markup_sentence(sent) for sent in self.sentences]
    self.document = fc.create_context_doc(self.markups)
    self.empty_markup = pyConText.ConTextMarkup()
    self.first_markup = self.markups[0]
    self.second_markup = self.markups[1]
    self.third_markup = self.markups[2]
    # A classifier per markup, plus one over an empty markup as a control.
    self.first_classifier = fc.markup_conditions(markup=self.first_markup)
    self.second_classifier = fc.markup_conditions(markup=self.second_markup)
    self.third_classifier = fc.markup_conditions(markup=self.third_markup)
    self.classifier = fc.markup_conditions(markup=self.empty_markup)
def analyzeReport(self, report):
    """
    given an individual radiology report, creates a pyConTextGraph
    object that contains the context markup

    report: a text string containing the radiology reports

    Markups are accumulated into self.document (verbose markups are
    printed as they are built); nothing is returned.
    """
    context = self.document
    targets = self.targets
    modifiers = self.modifiers
    splitter = helpers.sentenceSplitter()
    # alternatively you can skip the default exceptions and add your own
    # splitter = helpers.sentenceSpliter(useDefaults = False)
    #splitter.addExceptionTerms("Dr.","Mr.","Mrs.",M.D.","R.N.","L.P.N.",addCaseVariants=True)
    # Tune abbreviation handling so titles/degrees don't split sentences.
    splitter.addExceptionTerms("Ms.", "D.O.", addCaseVariants=True)
    splitter.deleteExceptionTerms("A.B.", "B.S.", deleteCaseVariants=True)
    sentences = splitter.splitSentences(report)
    # count tracks how many markups were added; it is not used after the
    # loop in this block.
    count = 0
    for s in sentences:
        #print s
        markup = pyConText.ConTextMarkup()
        # Verbose mode: the markup logs its processing steps.
        markup.toggleVerbose()
        markup.setRawText(s)
        markup.cleanText()
        markup.markItems(modifiers, mode="modifier")
        markup.markItems(targets, mode="target")
        #raw_input('marked targets and modifiers')
        #print "markup before pruning"
        #print markup.getXML()
        markup.pruneMarks()
        markup.dropMarks('Exclusion')
        # apply modifiers to any targets within the modifiers scope
        markup.applyModifiers()
        markup.pruneSelfModifyingRelationships()
        #context.pruneModifierRelationships()
        #context.dropInactiveModifiers()
        # add markup to context document
        print markup
        context.addMarkup(markup)
        count += 1
    context.computeDocumentGraph()
def analyzeReport(self, report):
    """
    given an individual radiology report, creates a pyConTextGraph
    object that contains the context markup

    report: a text string containing the radiology reports

    Side effects: appends the document XML to self.outString, writes a
    per-case PDF of the document graph, and pauses on raw_input.
    """
    context = pyConText.ConTextDocument()
    targets = self.targets
    modifiers = self.modifiers
    splitter = helpers.sentenceSplitter()
    # alternatively you can skip the default exceptions and add your own
    # splitter = helpers.sentenceSpliter(useDefaults = False)
    #splitter.addExceptionTerms("Dr.","Mr.","Mrs.",M.D.","R.N.","L.P.N.",addCaseVariants=True)
    # Tune abbreviation handling so titles/degrees don't split sentences.
    splitter.addExceptionTerms("Ms.", "D.O.", addCaseVariants=True)
    splitter.deleteExceptionTerms("A.B.", "B.S.", deleteCaseVariants=True)
    sentences = splitter.splitSentences(report)
    # count is initialized but never used in this block.
    count = 0
    for s in sentences:
        #print s
        markup = pyConText.ConTextMarkup()
        markup.setRawText(s)
        markup.cleanText()
        markup.markItems(modifiers, mode="modifier")
        markup.markItems(targets, mode="target")
        markup.pruneMarks()
        markup.dropMarks('Exclusion')
        markup.applyModifiers()
        print markup
        context.addMarkup(markup)
        # NOTE(review): XML is appended once per sentence, so earlier
        # sentences are duplicated in outString — confirm intended.
        self.outString += context.getXML()
        print context.getSectionText()
        #raw_input('continue')
    context.computeDocumentGraph(verbose=True)
    # Render the document graph to a per-case PDF via pydot.
    ag = nx.to_pydot(context.getDocumentGraph(), strict=True)
    ag.write("case%03d.pdf" % self.currentCase, format="pdf")
    #print "*"*42
    #print context.getXML(currentGraph=False)
    #print "*"*42
    raw_input('continue')
def countInFile(filename):
    """Count word frequencies in a text-like file, pickle the Counter and
    a word cloud, and build pyConText itemData from the top-4 words.

    NOTE(review): Python 2 only — relies on str.translate(None, ...),
    dict.iteritems(), and a tuple-unpacking lambda. `rand`, `punctuation`,
    `ItemData`, and `WordCloud` come from module scope — confirm in the
    enclosing file.
    """
    if filename.lower().endswith(('.txt','.csv','.rtf','.doc',)): # Checking file type
        with open(filename) as f:
            # Lazily strip punctuation and lowercase each line, then count
            # every token across the whole file.
            linewords = (line.translate(None, punctuation).lower().split() for line in f)
            x = Counter(chain.from_iterable(linewords))
        with open('freq' + str(rand) + '.pickle', 'wb') as handle:
            pickle.dump(x, handle, protocol=pickle.HIGHEST_PROTOCOL)
        #pycontext logic
        freq1, freq2, freq3, freq4 = x.most_common(4)
        data = ItemData([freq1], [freq2], [freq3], [freq4]) # grabbing just the top 4 frequent words for the itemData
        markup = pyConText.ConTextMarkup()
        # NOTE(review): the FILENAME (not the file contents) is used as the
        # raw text here — confirm intended.
        markup.setRawText(filename.lower())
        markup.cleanText()
        #markup.markItems(modifiers, mode="modifier")
        #print(markup.nodes(data=True))
        #print(type(markup.nodes()[0]))
        #generating visual aid
        cloud = WordCloud().generate_from_frequencies(x).to_file('image.png')
        with open('cloud' + str(rand) + '.pickle', 'wb') as handle:
            pickle.dump(cloud, handle, protocol=pickle.HIGHEST_PROTOCOL)
        # Drop the stop-word 'to' before serializing.
        x.pop('to', None)
        #converting to json
        data = json.dumps(x)
        json_data = json.loads(data)
        # Iterate counts by descending frequency; `k` is rebound each pass
        # and never used afterwards (dead code).
        for key, value in sorted(json_data.iteritems(), key=lambda (k, v): (v, k), reverse=True):
            k = list(key)
    elif filename.lower().endswith(('.docx','.odt','.pdf','.xlsx')):
        print('Please convert to readable file type, I.E. (.txt)')
    else:
        print('Invalid file type')
    return
def markup_sentence(self, sentence, prune_inactive=True):
    """Identify all target and modifier markups in a single sentence."""
    result = pyConText.ConTextMarkup()
    result.setRawText(sentence)
    #markup.cleanText()
    result.markItems(self.modifiers, mode="modifier")
    result.markItems(self.targets, mode="target")
    # pruneMarks occasionally raises TypeError on odd input; report and
    # continue with the unpruned markup.
    try:
        result.pruneMarks()
    except TypeError as err:
        print("Error in pruneMarks")
        print(result)
        print(err)
    result.dropMarks('Exclusion')
    # apply modifiers to any targets within the modifiers scope
    result.applyModifiers()
    result.pruneSelfModifyingRelationships()
    if prune_inactive:
        result.dropInactiveModifiers()
    return result
def process(self, doc):
    """
    Process a document with pyConText
    Args:
        document: tuple or list with document id and text to process
    Returns:
        doc_annots: list of tuples representing the document ID, sentence text, the span, and class
    """
    # don't try to process null notes
    if not doc[1]:
        if self.verbose:
            print("Error segmenting doc", doc[0])
        return []
    # odd notes may throw an error. Just continue rather than stopping the entire process
    try:
        sentences = self.sentence_tokenizer.segToSentenceSpans(doc[1])
    except KeyError:
        if self.verbose:
            print("Error segmenting doc", doc[0])
        return []
    #context_doc = pyConTextGraph.ConTextDocument()  # ConTextDoc not needed for simple usage
    doc_annots = list()
    for sentence in sentences:
        # run sentence tokenizer on input text, return the spans
        sentence_text = doc[1][sentence.begin:sentence.end]
        # process every sentence by adding markup
        markup = pyConTextGraph.ConTextMarkup()
        markup.setRawText(sentence_text)
        markup.cleanText()
        # apply targets and modifiers
        markup.markItems(self.targets, mode="target")
        markup.markItems(self.modifiers, mode="modifier")
        # address scope of modifiers to targets, remove inactive modifiers and self-modifying relationships
        markup.pruneMarks()
        markup.applyModifiers()
        markup.pruneSelfModifyingRelationships()
        markup.dropInactiveModifiers()
        marked_targets = markup.getMarkedTargets()
        for marked_target in marked_targets:
            modifiers = markup.getModifiers(marked_target)
            if not modifiers:
                # Unmodified target: span is the target itself, offset to
                # document coordinates via sentence.begin.
                span = (sentence.begin + marked_target.getSpan()[0],
                        sentence.begin + marked_target.getSpan()[1])
                # NOTE(review): if self.mode is neither 'combined' nor
                # 'separate', `annot` is unbound/stale here — confirm mode
                # is validated upstream.
                if self.mode == 'combined':
                    annot = (doc[0], marked_target.getPhrase(), span[0], span[1],
                             marked_target.getCategory()[0] + '_unspecified',
                             marked_target.getCode())
                elif self.mode == 'separate':
                    annot = (doc[0], marked_target.getPhrase(), span[0], span[1],
                             marked_target.getCategory()[0], 'unspecified',
                             marked_target.getCode())
                if annot not in doc_annots:
                    doc_annots.append(annot)
            else:
                for modifier in modifiers:
                    # Span runs from whichever of target/modifier starts
                    # first to the end of the other, in document coords.
                    if marked_target.getSpan()[0] < modifier.getSpan()[0]:
                        span = (sentence.begin + marked_target.getSpan()[0],
                                sentence.begin + modifier.getSpan()[1])
                    else:
                        span = (sentence.begin + modifier.getSpan()[0],
                                sentence.begin + marked_target.getSpan()[1])
                    if self.mode == 'combined':
                        annot = (doc[0], doc[1][span[0]:span[1]], span[0], span[1],
                                 marked_target.getCategory()[0] + '_' + modifier.getCategory()[0],
                                 marked_target.getCode())
                    elif self.mode == 'separate':
                        annot = (doc[0], doc[1][span[0]:span[1]], span[0], span[1],
                                 marked_target.getCategory()[0], modifier.getCategory()[0],
                                 marked_target.getCode())
                    if annot not in doc_annots:
                        doc_annots.append(annot)
        #context_doc.addMarkup(markup)
    return doc_annots
def analyzeReport( self, idName, report, modFilters=[ 'indication', 'pseudoneg', 'probable_negated_existence', 'definite_negated_existence', 'probable_existence', 'definite_existence', 'historical', 'carotid_critical', 'carotid_noncritical', 'right_sidedness', 'left_sidedness', 'bilateral_sidedness', 'sidedness', 'common_carotid_neurovascularanatomy', 'bulb_carotid_neurovascularanatomy', 'internal_carotid_neurovascularanatomy' ]): """given an individual radiology report, creates a pyConTextSql object that contains the context markup report: a text string containing the radiology reports mode: which of the pyConText objects are we using: disease modFilters: """ self.context = pyConText.ConTextDocument() targets = self.targets modifiers = self.modifiers if modFilters == None: modFilters = [ 'indication', 'pseudoneg', 'probable_negated_existence', 'definite_negated_existence', 'probable_existence', 'definite_existence', 'historical', 'carotid_critical', 'carotid_noncritical', 'right_sidedness', 'left_sidedness', 'bilateral_sidedness', 'sidedness', 'bulb_carotid_neurovascularanatomy', 'common_carotid_neurovascularanatomy', 'internal_carotid_neurovascularanatomy', ] splitter = helpers.sentenceSplitter() sentences = splitter.splitSentences(report) count = 0 print idName for s in sentences: markup = pyConText.ConTextMarkup() markup.setRawText(s) markup.cleanText() markup.markItems(modifiers, mode="modifier") markup.markItems(targets, mode="target") #markup.pruneMarks() #markup.dropMarks('Exclusion') markup.applyModifiers() #markup.pruneModifierRelationships() markup.dropInactiveModifiers() count += 1 self.context.addMarkup(markup) idName, sevFlag, htmlStr = html.mark_document_with_html( idName, self.context) #;raw_input() # fo=open(self.html_dir+"\\%s.html"%idName, "w") # fo.write(htmlStr) # fo.close() self.outString += self.context.getXML() + u"\n" print self.context.getXML() #;raw_input() return idName, sevFlag, htmlStr
def test_ConTextMarkup():
    """A ConTextMarkup must be usable anywhere a networkx DiGraph is."""
    markup = pyConText.ConTextMarkup()
    assert isinstance(markup, nx.DiGraph)