def list_conditions_with_qumls(path_to_directory_condition, path_to_qumls_files):
    """
    :param path_to_directory_condition: path to the directory where the list of conditions is stored.
    :param path_to_qumls_files: path to the QuickUMLS installation files.
    :returns: for each variable, which part of the string is recognized as a
        biomedical concept, which biomedical concept it is mapped to, and
        whether it is fully, partly, or not recognized.
    """
    term_dict = dict()
    matcher = QuickUMLS(path_to_qumls_files)
    for string in itterating_sentences(path_to_directory_condition):
        x = matcher.match(string, best_match=True, ignore_syntax=False)
        term_string = string
        if len(x) > 0:
            for y in x:  # y is the candidate list for one matched phrase
                for z in y:  # z is a single candidate dict
                    ngram = z["ngram"]
                    term2 = z["term"]
                    term3 = z["similarity"]
                    if term_string.lower() == term2.lower():
                        term_dict[term_string] = [ngram, term2, "full recognition"]
                    else:
                        term_dict[term_string] = [ngram, term2,
                                                  "partial recognition", term3]
        else:
            term_dict[term_string] = ["none", "none", "not recognized"]
    return term_dict
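# Hedged usage sketch for the helper above: both paths are placeholders, and
# itterating_sentences (defined elsewhere) is assumed to yield one condition
# string at a time.
#
# term_dict = list_conditions_with_qumls('/path/to/conditions',
#                                        '/path/to/quickumls_install')
# for condition, summary in term_dict.items():
#     # summary: [ngram, matched_term, recognition_level(, similarity)]
#     print(condition, '->', summary)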
def extract(self, file_item):
    print('quickumls_fp: ' + self.quickumls_fp)
    print('overlapping_criteria: ' + self.overlapping_criteria)
    print('threshold: ' + str(self.threshold))
    print('similarity_name: ' + self.similarity_name)
    print('minMatchedLength: ' + str(self.minMatchedLength))
    print('window: ' + str(self.window))

    matcher = QuickUMLS(self.quickumls_fp, self.overlapping_criteria,
                        self.threshold, self.window, self.similarity_name,
                        self.minMatchedLength, constants.ACCEPTED_SEMTYPES,
                        True)
    extraction_result = matcher.match(self.text, best_match=True,
                                      ignore_syntax=False)
    self.buildXML(extraction_result, file_item)
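# The positional arguments above follow the QuickUMLS constructor order. An
# equivalent keyword call (parameter names as in recent QuickUMLS releases;
# the trailing True is the verbose flag) would read:
#
# matcher = QuickUMLS(self.quickumls_fp,
#                     overlapping_criteria=self.overlapping_criteria,
#                     threshold=self.threshold,
#                     window=self.window,
#                     similarity_name=self.similarity_name,
#                     min_match_length=self.minMatchedLength,
#                     accepted_semtypes=constants.ACCEPTED_SEMTYPES,
#                     verbose=True)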
class QuickUMLSProcessor(MERToolProcessor):
    def __init__(self, config):
        self.__quickumls = QuickUMLS('/home/daniel/QuickUMLS')
        self.__matches = None
        super().__init__(config)

    def process_input(self):
        """Extracts information from input"""
        input_file = self._input_filepath.open(encoding='utf8')
        text = input_file.read()
        print('--- QuickUMLS: Processing input ---')
        start_time = time.time()
        self.__matches = self.__quickumls.match(text, best_match=True,
                                                ignore_syntax=False)
        end_time = time.time() - start_time
        print('--- {} seconds ---'.format(end_time))

    def format_output(self):
        """Formats the original output to eHealth-KD subtask A output"""
        # Only first term (preferred term) of each matched phrase
        umls_concepts = map(lambda match_list: match_list[0], self.__matches)
        # Order by start offset
        ordered_concepts = sorted(
            umls_concepts, key=lambda umls_concept: umls_concept['start'])
        # Converts a UMLS concept to an eHealth-KD keyphrase
        for concept in ordered_concepts:
            keyphrase = {'label': 'Concept', 'term': concept['ngram']}
            tokens = concept['ngram'].split()
            if len(tokens) <= 1:
                keyphrase['span'] = '{0} {1}'.format(concept['start'],
                                                     concept['end'])
            else:
                # One (start, end) pair per token, assuming tokens are
                # separated by single spaces
                span = []
                for token in tokens:
                    if not span:
                        span.append((concept['start'],
                                     concept['start'] + len(token)))
                    else:
                        span.append((span[-1][1] + 1,
                                     span[-1][1] + 1 + len(token)))
                span = map(lambda tup: '{0} {1}'.format(tup[0], tup[1]), span)
                keyphrase['span'] = ';'.join(span)
            self._key_phrases.append(keyphrase)
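# Worked, standalone version of the span arithmetic above, for illustration
# only (the method itself lives inside the class):
def _token_spans(ngram, start):
    """Compute per-token character spans from an ngram and its start offset."""
    spans = []
    for token in ngram.split():
        begin = start if not spans else spans[-1][1] + 1
        spans.append((begin, begin + len(token)))
    return ';'.join('{0} {1}'.format(b, e) for b, e in spans)

# _token_spans("heart attack", 10) -> "10 15;16 22"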
class QUMLS(BaseLinker):
    def __init__(self, args):
        from quickumls import QuickUMLS
        assert args.quickumls_path is not None, \
            "Please provide the path where QuickUMLS is installed"
        assert args.num_worker == 1, \
            "QuickUMLS doesn't support num_workers > 1"
        # 'score' is the overlapping_criteria argument
        self.matcher = QuickUMLS(args.quickumls_path, 'score', threshold=0.6)

    def __call__(self, text):
        qumls_res = self.matcher.match(text)
        # ddict: defaultdict (from collections import defaultdict as ddict)
        men_list = ddict(list)
        for men in qumls_res:  # one candidate list per mention
            for cand in men:
                start, end = cand['start'], cand['end']
                umls_cui = cand['cui']
                score = cand['similarity']
                men_list[(start, end)].append([umls_cui, round(score, 3)])
        return self.reformat(men_list, text)
def process_data(pid, doc_list):
    data = []
    matcher = QuickUMLS(args.quickumls_path, 'score', threshold=0.6)
    for i, doc in enumerate(doc_list):
        qumls_res = matcher.match(doc['text'])
        res_list = ddict(list)
        for men in qumls_res:
            for cand in men:
                start, end = cand['start'], cand['end']
                umls_cui = cand['cui']
                score = cand['similarity']
                res_list[(start, end)].append((umls_cui, score))
        doc['result'] = dict(res_list)
        data.append(doc)
        if i % 10 == 0:
            print('Completed [{}] {}, {}'.format(
                pid, i,
                time.strftime("%d_%m_%Y") + '_' + time.strftime("%H:%M:%S")))
    return data
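# Hedged sketch of fanning process_data out over worker processes: because
# each call builds its own QuickUMLS matcher, document chunks can be handled
# independently. `docs` and `chunkify` are hypothetical stand-ins for the
# caller's data, and `args` must already exist at module level as above.
#
# from multiprocessing import Pool
#
# def chunkify(items, n):
#     return [items[i::n] for i in range(n)]
#
# if __name__ == '__main__':
#     with Pool(4) as pool:
#         chunks = chunkify(docs, 4)
#         results = pool.starmap(process_data, enumerate(chunks))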
try:
    from quickumls import QuickUMLS
except ImportError:
    from .quickumls import QuickUMLS

print('Creating QuickUMLS object...')
quickumls_path = r'C:\quickumls'
matcher = QuickUMLS(quickumls_path)
print('QuickUMLS object created...')

text = "The ulna has dislocated posteriorly from the trochlea of the humerus."

print('*************************')
print('Text:')
print(text)
print('*************************')

res = matcher.match(text, best_match=True, ignore_syntax=False)
print('Matching results:')
print(res)
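# Illustrative shape of `res` (placeholder values, not real output): one inner
# list per matched phrase, each candidate a dict with the keys used throughout
# these snippets -- start, end, ngram, term, cui, similarity, semtypes:
#
# [[{'start': 4, 'end': 8, 'ngram': 'ulna', 'term': 'Ulna',
#    'cui': 'C0000000', 'similarity': 1.0, 'semtypes': {'T023'}}]]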
## if running more than once, comment this line out. Will result in an error if you try to define 'matcher' more than once.
## path should be your destination_path created during QuickUMLS installation. Change accordingly.
matcher = QuickUMLS(
    '/Users/madisonmyers/Desktop/QuickUMLS-master/destination_path')

location = os.getcwd()  ## Will use the directory you are working in. Make sure notes/text files are available in this folder.

for file in os.listdir(location):
    if file.endswith(".txt"):
        ## many of the UCSF clinical notes need utf-8 encoding, else reading will result in an error
        open_file = open(file, 'r', encoding='utf-8', errors='ignore')
        doclist = [line for line in open_file]
        docstr = ''.join(doclist)
        bn_sents = re.split(r'[.!?]', docstr)
        ## matcher.match() expects a string, so match each sentence in turn
        ## (the original call passed the whole list plus an undefined `result`)
        out = [matcher.match(sent, best_match=True, ignore_syntax=False)
               for sent in bn_sents]
        filename = file.split(".")[0].split("/")[-1]
        ## most common negated terms in clinical text
        f = ["not", "no", "denies", "without", "no evidence", "with no",
             "negative for"]
        ## add filename for any text file you don't want to input. Common examples are below.
        # if filename == "requirements":
        #     continue
        # if filename == "LICENSE":
        #     continue
        for line in bn_sents:
            if any(i.lower() in line.lower() for i in f):
                continue
            else:
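## Note on the negation filter above: substring membership ("no" in line) also
## fires inside words like "normal" or "notable". A hedged alternative using
## word-boundary regexes over the same trigger list:
import re

NEGATION_TRIGGERS = ["not", "no", "denies", "without", "no evidence",
                     "with no", "negative for"]
NEGATION_RE = re.compile(
    r'\b(?:' + '|'.join(re.escape(t) for t in NEGATION_TRIGGERS) + r')\b',
    re.IGNORECASE)

def is_negated(sentence):
    """Return True if the sentence contains a whole-word negation trigger."""
    return bool(NEGATION_RE.search(sentence))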
results_list = []
result_count = 0
start_time = time.time()
output_dir = 'output/performance_test'

for i in range(total_iterations):
    if i % 100 == 0:
        print('Progress : [{0}/{1}]'.format(i, total_iterations))
    filename = '{0}.csv'.format(i)
    f = open(os.path.join(output_dir, filename), 'w')
    match_results = matcher.match(text, best_match=True,
                                  ignore_syntax=ignore_syntax)
    results_list.append(match_results)
    result_count += len(match_results)
    header = 'text,start,end,CUI,term,similarity\n'
    f.write(header)
    # this is a list of lists
    for match_result in match_results:
        # each match may contain multiple ngram entries
        for ngram_match_dict in match_result:
            #print(ngram_match_dict)
            line = '"{0}",{1},{2},{3},"{4}",{5:.2f}\n'.format(
                ngram_match_dict['ngram'],
                ngram_match_dict['start'],
                ngram_match_dict['end'],
                ngram_match_dict['cui'],
                ngram_match_dict['term'],
                ngram_match_dict['similarity'])
            f.write(line)
    f.close()
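# Design note: the manual quoting above breaks if an ngram or term itself
# contains a double quote. A hedged equivalent using the csv module (same
# columns; the function name is illustrative):
import csv

def write_matches_csv(path, match_results):
    with open(path, 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(['text', 'start', 'end', 'CUI', 'term', 'similarity'])
        for match_result in match_results:
            for m in match_result:
                writer.writerow([m['ngram'], m['start'], m['end'],
                                 m['cui'], m['term'],
                                 '{0:.2f}'.format(m['similarity'])])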
from quickumls import QuickUMLS

to_annot_data = pd.read_csv('toAnnotateWithText_9thSept.csv')
matcher = QuickUMLS(
    '/home/roysoumya/Documents/ClinicalTrials_Coding/QuickUMLS/QuickUMLS_data/')

brief_title_concepts_list = list()
brief_summ_concepts_list = list()

for row_id in range(to_annot_data.shape[0]):
    brief_title = to_annot_data.iloc[row_id, 2]
    brief_summ = to_annot_data.iloc[row_id, 3]
    brief_title_umls = matcher.match(brief_title, best_match=True,
                                     ignore_syntax=False)
    brief_title_concepts_list.append(';'.join(
        [elem[0][u'cui'] for elem in brief_title_umls]))
    brief_summ_umls = matcher.match(brief_summ, best_match=True,
                                    ignore_syntax=False)
    brief_summ_concepts_list.append(';'.join(
        [elem[0][u'cui'] for elem in brief_summ_umls]))
    if row_id % 50 == 0:
        print(row_id)

print('Number of brief title elements: ', len(brief_title_concepts_list))
print('Number of brief summary elements: ', len(brief_summ_concepts_list))
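# Hedged follow-up sketch: attach the semicolon-joined CUI strings as new
# columns and write them out. The column and output file names below are
# assumptions, not from the original script.
to_annot_data['BRIEF_TITLE_CUIS'] = brief_title_concepts_list
to_annot_data['BRIEF_SUMM_CUIS'] = brief_summ_concepts_list
to_annot_data.to_csv('toAnnotateWithText_9thSept_cuis.csv', index=False)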
class QuickUMLSDriver(EntityLinker):
    def __init__(self, name="quickumls", quickumls_install="",
                 criterion="score", min_score=0.7, keep_semtypes=None):
        """
        Interface to QuickUMLS.

        :param str quickumls_install: The path to the QuickUMLS installation.
        :param float min_score: Minimum score to consider, between 0 and 1.0.
        :param list keep_semtypes: List of semantic types to consider.
        """
        super().__init__(name)
        self.quickumls_install = quickumls_install
        self.criterion = criterion
        self.min_score = min_score
        self.keep_semtypes = keep_semtypes
        self._log_parameters()
        self._start()

    def _log_parameters(self):
        self._log(f"Starting annotator '{self.name}'")
        self._log(f"{self.name} parameters:")
        self._log(f"  quickumls_install : {self.quickumls_install}")
        self._log(f"  criterion : {self.criterion}")
        self._log(f"  min_score : {self.min_score}")
        self._log(f"  keep_semtypes : {self.keep_semtypes}")

    def _start(self):
        """
        Instantiate the QuickUMLS matcher.
        """
        self._linker = QuickUMLS(self.quickumls_install,
                                 overlapping_criteria=self.criterion,
                                 threshold=self.min_score,
                                 accepted_semtypes=self.keep_semtypes)
        self._log("Started")

    def _convert_output_to_candidate_links(self, outputs):
        """
        Convert the raw QuickUMLS output into CandidateLink instances.
        Output is of the format:
            {matched_string: [CandidateLink, [...]]}

        :param list outputs: List of outputs from QuickUMLS.match().
        :returns: Dictionary of matched strings to CandidateLink instances.
        :rtype: dict
        """
        links = defaultdict(list)
        for phrase in outputs:
            seen_cuis = set()
            for match in phrase:
                try:
                    candidate_term = match["preferred_term"]
                    if candidate_term == "":  # No preferred_term found.
                        candidate_term = match["term"]
                except KeyError:
                    candidate_term = match["term"]
                # QuickUMLS sometimes returns the same CUI more than once.
                if match["cui"] in seen_cuis:
                    continue
                else:
                    seen_cuis.add(match["cui"])
                candidate = CandidateLink(
                    input_string=match["ngram"],
                    candidate_term=candidate_term,
                    candidate_source="UMLS",
                    candidate_id=match["cui"],
                    linking_score=match["similarity"],
                    # attrs
                    umls_semantic_type=match["semtypes"])
                links[match["ngram"]].append(candidate)
        return links

    def link(self, queries):
        """
        Link query or list of queries to entities in the corresponding
        database. Input should be a sequence of (ID, text) pairs. Outputs a
        nested dictionary of the format
            {input_id: {matched_input: [CandidateLink, [...]]}}

        :param list queries: List of (ID, string) pairs to link.
        :returns: Dictionary of input strings to CandidateLink instances.
        :rtype: dict
        """
        queries = self._prepare_queries(queries, ascii_only=False)
        all_links = {}
        for (qid, query) in queries:
            output = self._linker.match(query)
            links = self._convert_output_to_candidate_links(output)
            all_links[qid] = links
        return all_links

    def get_best_links(self, candidate_links, keep_top_n):
        """
        Given a set of candidate links for a set of input strings returned by
        EntityLinker.link(), choose the N "best" linkings for each input
        string from among the candidate links.
            {input_id: {matched_input: [CandidateLink, [...]]}}

        :param dict candidate_links: Dictionary of input strings to candidate
            linkings.
        :param int keep_top_n: Number of top-scoring candidates to keep.
        :returns: candidate_links filtered to include only the N "best" links.
        :rtype: dict
        """
        for qid in candidate_links.keys():
            for (matched_str, candidates) in candidate_links[qid].items():
                # Exact string matches get a perfect linking score
                for c in candidates:
                    if matched_str.lower() == c.candidate_term.lower():
                        c.linking_score = 1.0
                candidates_sorted = sorted(candidates,
                                           key=lambda x: x.linking_score,
                                           reverse=True)
                candidates_top_n = candidates_sorted[:keep_top_n]
                candidate_links[qid][matched_str] = candidates_top_n
        return candidate_links
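# Hedged usage sketch for the driver above (the install path and query text
# are placeholders; EntityLinker and CandidateLink come from the surrounding
# package):
#
# driver = QuickUMLSDriver(quickumls_install="/path/to/quickumls",
#                          min_score=0.7)
# links = driver.link([("doc1", "history of myocardial infarction")])
# best = driver.get_best_links(links, keep_top_n=1)
# for qid, matches in best.items():
#     for matched_str, candidates in matches.items():
#         print(qid, matched_str, [c.candidate_id for c in candidates])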
def main(args):
    print('=============')
    if args.granularity not in ['N', 'S', 'W']:
        raise TypeError(
            'Invalid value for the granularity - should be N, S, or W')

    print('Reading MIMIC-III data...')
    if args.skiplims is None:
        notes_df = read_csv(args.noteevents_fp)
    else:
        to_skip = []
        for i in range(0, len(args.skiplims), 2):
            to_skip += [j for j in range(args.skiplims[i],
                                         args.skiplims[i + 1])]
        notes_df = read_csv(args.noteevents_fp, skiprows=to_skip)

    print('Preprocessing notes ...')
    parsed_list = []
    for note in tqdm(notes_df['TEXT']):
        note = note.lower()
        note = re.sub('[^a-zA-Z.]', ' ', note)
        note = re.sub(r'\s+', ' ', note)
        # For finer granularity than entire notes, they are tokenized so that
        # we can iterate over sentences or words
        if args.granularity != 'N':
            note = nltk.sent_tokenize(note)
            if args.granularity == 'W':
                for i in range(len(note)):
                    note[i] = re.sub('[.]', '', note[i])
                # the original was missing the call: [nltk.word_tokenize ...]
                note = [nltk.word_tokenize(sentence) for sentence in note]
                for i in range(len(note)):
                    note[i] = [word for word in note[i]
                               if word not in stopwords.words('english')]
        parsed_list.append(note)

    print('Matching with UMLS corpus...')
    # initialise QuickUMLS string matching object
    matcher = QuickUMLS(args.qumls_fp, threshold=args.thresh,
                        similarity_name=args.sim)
    # useful to define these two here so the mapping loop isn't too verbose
    qumls_getter = lambda n: matcher.match(n, best_match=False,
                                           ignore_syntax=False)
    # this gets the maximum similarity score and its index in the list for
    # that ngram
    simscore_getter = lambda l: max(enumerate([d['similarity'] for d in l]),
                                    key=itemgetter(1))

    ALL = args.attr == 'all'
    if ALL:
        # make a dictionary which will have the columns to be added to the
        # dataframe
        names = ['term', 'cui', 'semtypes']
        attrs = {}
        for name in names:
            attrs[name] = []
    else:
        mapped_corpus = []
    if args.keep_similarity:
        similarity_scores = []

    for note in tqdm(parsed_list):
        if ALL:
            # note-level mini-version of the dictionary "attrs" to collect
            # the attributes for each note
            sub_attr = {}
            for name in names:
                sub_attr[name] = []
        else:
            single_attr_list = []
        if args.keep_similarity:
            sim_list = []

        if args.granularity == 'N':
            res = qumls_getter(note)
            for l in res:
                ss = simscore_getter(l)
                if ALL:
                    for name in names:
                        sub_attr[name].append(l[ss[0]][name])
                else:
                    single_attr_list.append(l[ss[0]][args.attr])
                if args.keep_similarity:
                    sim_list.append(ss[1])
        else:
            for s in note:
                if args.granularity != 'W':
                    res = qumls_getter(s)
                    for l in res:
                        ss = simscore_getter(l)
                        if ALL:
                            for name in names:
                                sub_attr[name].append(l[ss[0]][name])
                        else:
                            single_attr_list.append(l[ss[0]][args.attr])
                        if args.keep_similarity:
                            sim_list.append(ss[1])
                else:
                    for w in s:
                        res = qumls_getter(w)[0]
                        ss = simscore_getter(res)
                        if ALL:
                            for name in names:
                                sub_attr[name].append(res[ss[0]][name])
                        else:
                            single_attr_list.append(res[ss[0]][args.attr])
                        if args.keep_similarity:
                            sim_list.append(ss[1])

        if ALL:
            if args.filter_semtypes_file is not None:
                irrelevant_type_ids = [
                    i[:-1] for i in open(args.filter_semtypes_file, 'r')
                ]
                # collect positions (not values) to drop, so duplicate
                # entries are handled correctly (.index() in the original
                # always returned the first occurrence)
                indices_to_remove = [
                    idx for idx, st_set in enumerate(sub_attr['semtypes'])
                    if all(st in irrelevant_type_ids for st in st_set)
                ]
                for name in names:
                    sub_attr[name] = [
                        st for idx, st in enumerate(sub_attr[name])
                        if idx not in indices_to_remove
                    ]
            for name in names:
                mapped_note = ''
                for a in sub_attr[name]:
                    if name == 'semtypes':
                        for a_ in a:
                            mapped_note += a_ + ' '
                    else:
                        mapped_note += a + ' '
                attrs[name].append(mapped_note)
        else:
            mapped_note = ''
            for word in single_attr_list:
                mapped_note += word
                mapped_note += ' '
            mapped_corpus.append(mapped_note)
        if args.keep_similarity:
            # sim_list holds only the current note's scores; collect it per
            # note (the original assigned the last note's sim_list directly)
            similarity_scores.append(sim_list)

    print('Matching finished!')
    print('Writing .csv file...')
    if ALL:
        for name, mapped_corpus in attrs.items():
            notes_df[name.upper()] = mapped_corpus
        if args.keep_similarity:
            notes_df['SIM_SCORE'] = similarity_scores
    else:
        notes_df[args.attr.upper()] = mapped_corpus
    if args.outfilepath[-4:] != '.csv':
        args.outfilepath += '.csv'
    notes_df.to_csv(args.outfilepath, index=False)
    print('Done!')
    print('=============')
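# A hedged argparse sketch consistent with the flags used in main() above
# (defaults and choices are assumptions, not from the original script):
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--noteevents_fp', required=True)
    parser.add_argument('--qumls_fp', required=True)
    parser.add_argument('--outfilepath', required=True)
    parser.add_argument('--granularity', default='N', choices=['N', 'S', 'W'])
    parser.add_argument('--attr', default='cui')
    parser.add_argument('--thresh', type=float, default=0.7)
    parser.add_argument('--sim', default='jaccard')
    parser.add_argument('--skiplims', type=int, nargs='+', default=None)
    parser.add_argument('--keep_similarity', action='store_true')
    parser.add_argument('--filter_semtypes_file', default=None)
    main(parser.parse_args())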
if os.environ.get("deployment", False):
    app.config.from_pyfile('/etc/cs4300-volume-cfg/cs4300app.cfg')
else:
    app.config.from_pyfile(os.path.join(
        os.path.join(os.getcwd(), "secrets"), "cs4300app.cfg"))

gunicorn_logger = logging.getLogger('gunicorn.error')
app.logger.handlers = gunicorn_logger.handlers
app.logger.setLevel(gunicorn_logger.level)

# give each worker process its own copy of the matcher's data directory
os.system("cp -r concept_matching/quickUCSLS concept_matching/quickUCSLS_{}".format(os.getpid()))
app.logger.debug("PID: {}".format(os.getpid()))
concept_matcher = QuickUCSLS(
    "./concept_matching/quickUCSLS_{}".format(os.getpid()),
    accepted_semtypes={"T{:03d}".format(i) for i in range(1, 35)},
    threshold=0.5,
    min_match_length=0)
app.logger.debug("Matcher res: {}".format(concept_matcher.match("cos sim")))
app.logger.debug("Matcher Ready")

def get_preferred_terms():
    # expects four pipe-separated fields per line (CUI, string, unused,
    # preferred flag), i.e. a reduced file rather than the full MRCONSO layout
    preferred_term = dict()
    with codecs.open("./concept_matching/definition_files/MRCONSO.RRF") as f:
        for i, ln in enumerate(f):
            if i < 1:  # skip the header line
                continue
            cui, s, _, pref = ln.strip().split("|")
            if pref == "Y":
                preferred_term[cui] = s
    return preferred_term

preferred_term = get_preferred_terms()
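# Hedged sketch of how the matcher and preferred_term map might be served from
# this app (the route name and payload shape are assumptions, not from the
# original application):
#
# @app.route("/concepts/<query>")
# def concepts(query):
#     matches = concept_matcher.match(query)
#     hits = [{"ngram": c["ngram"],
#              "term": preferred_term.get(c["cui"], c["term"]),
#              "cui": c["cui"]}
#             for phrase in matches for c in phrase]
#     return {"query": query, "hits": hits}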
count_quote = line.count('"')
# count_comma is assumed to be computed earlier in the full script
if count_comma >= 1:
    # New clinical note
    list_cui = []
    list_terms = []
    fw.write(line)
    continue
print(lineNb, flush=True)
# if line not in myDict.keys():
#     matches = matcher.match(line, best_match=True, ignore_syntax=False)
#     print(matches)
#     myDict[line] = matches
# else:
#     matches = myDict[line]
matches = matcher.match(line, best_match=True, ignore_syntax=False)
concepts_output = []
for phrase_candidate in matches:
    # Find the highest similarity among this phrase's candidates
    # (renamed from `max`, which shadowed the builtin)
    max_similarity = 0
    # print("PC :", phrase_candidate)
    for candidate in phrase_candidate:
        if candidate['similarity'] > max_similarity:
            max_similarity = candidate['similarity']
    # Get preferred terms for that max
    list_to_write = []
    for candidate in phrase_candidate:
        if candidate['similarity'] == max_similarity:
            # print("C : ", candidate)
            if candidate['term'] not in list_terms:
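# The two passes above can be collapsed into one idiomatic helper; a minimal
# standalone sketch operating on one phrase's candidate list:
def best_candidates(phrase_candidate):
    """Return the candidates tied for the highest similarity score."""
    if not phrase_candidate:
        return []
    top = max(c['similarity'] for c in phrase_candidate)
    return [c for c in phrase_candidate if c['similarity'] == top]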