from quickumls import QuickUMLS


def list_conditions_with_qumls(path_to_directory_condition,
                               path_to_qumls_files):
    """
    param: path to directory where list of conditions is stored
    returns for each variable what part of the string is recognized as a biomedical concept, to which biomedical concept it is mapped, and if its fully/partly or not recognized,
    """

    term_dict = dict()
    matcher = QuickUMLS(path_to_qumls_files)
    # itterating_sentences is a helper defined elsewhere in the source module;
    # it yields one condition string at a time.
    for string in itterating_sentences(path_to_directory_condition):
        matches = matcher.match(string, best_match=True, ignore_syntax=False)
        if matches:
            for match_list in matches:
                for match in match_list:
                    ngram = match["ngram"]
                    matched_term = match["term"]
                    similarity = match["similarity"]
                    if string.lower() == matched_term.lower():
                        term_dict[string] = [
                            ngram, matched_term, "full recognition"
                        ]
                    else:
                        term_dict[string] = [
                            ngram, matched_term, "partial recognition",
                            similarity
                        ]
        else:
            term_dict[string] = ["none", "none", "not recognized"]
    return term_dict
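
A minimal usage sketch for the function above; both paths are hypothetical and must point at your own condition directory and QuickUMLS installation:

# Hypothetical paths -- replace with your own locations.
conditions = list_conditions_with_qumls(
    '/data/conditions/',           # directory holding the condition strings
    '/data/quickumls_install/')    # QuickUMLS destination_path
for condition, result in conditions.items():
    # result is [ngram, matched term, status], plus similarity for partial matches
    print(condition, result)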
Example #4
    def extract(self, file_item):
        print('quickumls_fp: ' + self.quickumls_fp)
        print('overlapping_criteria: ' + self.overlapping_criteria)
        print('threshold: ' + str(self.threshold))
        print('similarity_name: ' + self.similarity_name)
        print('minMatchedLength: ' + str(self.minMatchedLength))
        print('window: ' + str(self.window))

        matcher = QuickUMLS(self.quickumls_fp, self.overlapping_criteria,
                            self.threshold, self.window, self.similarity_name,
                            self.minMatchedLength, constants.ACCEPTED_SEMTYPES,
                            True)  # final positional argument enables verbose output

        extraction_result = matcher.match(self.text,
                                          best_match=True,
                                          ignore_syntax=False)
        self.buildXML(extraction_result, file_item)
Example #5
def run_quickumls_server(opts):
    matcher = QuickUMLS(quickumls_fp=opts.quickumls_fp,
                        threshold=opts.threshold,
                        overlapping_criteria=opts.overlapping_criteria,
                        similarity_name=opts.similarity_name,
                        window=opts.window,
                        min_match_length=opts.min_match_length,
                        verbose=opts.verbose)

    run_server(matcher, host=opts.host, port=opts.port, buffersize=4096)
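
A hedged sketch of how the opts object consumed by run_quickumls_server might be built; the flag names simply mirror the attributes used above and are assumptions, not the project's actual CLI:

import argparse

def parse_server_opts():
    # Hypothetical CLI; defaults here are illustrative.
    p = argparse.ArgumentParser()
    p.add_argument('--quickumls_fp', required=True)
    p.add_argument('--threshold', type=float, default=0.7)
    p.add_argument('--overlapping_criteria', default='score')
    p.add_argument('--similarity_name', default='jaccard')
    p.add_argument('--window', type=int, default=5)
    p.add_argument('--min_match_length', type=int, default=3)
    p.add_argument('--verbose', action='store_true')
    p.add_argument('--host', default='localhost')
    p.add_argument('--port', type=int, default=4645)
    return p.parse_args()

# run_quickumls_server(parse_server_opts())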
Example #6
 def load(
     cls,
     path_to_quickumls: str,
     accepted_semtypes: Optional[Set[str]] = None,
     threshold: float = 0.9,
     similarity_name: str = "jaccard",
     spacy_string: str = "en_core_sci_sm",
     best_match: bool = False,
     n_workers: int = 1,
 ) -> "QuickUMLSClassifier":
     if accepted_semtypes is None:
         accepted_semtypes = ALL_SEMTYPES
     q = QuickUMLS(path_to_quickumls,
                   accepted_semtypes=accepted_semtypes,
                   threshold=threshold,
                   similarity_name=similarity_name)
      # Load the spaCy model, disabling NER and the parser.
     q.nlp = spacy.load(spacy_string, disable=("ner", "parser"))
     return cls(q, n_workers)
Example #7
    def load(
        cls,
        path_to_quickumls: str,
        accepted_semtypes: Optional[Set[str]] = None,
        threshold: float = 0.9,
        similarity_name: str = "jaccard",
        pooling: str = "mean",
        spacy_string: str = "en_core_sci_sm",
        priors: Optional[Dict[str, float]] = None,
        n_workers: int = 1,
    ) -> "QuickUMLSClassifier":
        """
        Load a QuickUMLSClassifier instance.

        :param path_to_quickumls: The path to a valid quickUMLS installation.
        :param accepted_semtypes: A set of accepted semantic types. If this is None, we revert to all semantic types.
        :param threshold: The threshold to accept.
        :param similarity_name: The name of the similarity function. Accepted are 'jaccard', 'overlap', 'cosine' and 'dice'.
        :param pooling: The name of the pooling function to use. Should be 'mean', 'max' or 'sum'.
        :param spacy_string: The string of the spacy model to use.
        :param priors: None or a dictionary mapping from semantic types to class probabilities.
        :param n_workers: The number of workers to use during prediction.
        :return: An initialized QuickUMLSClassifier.
        """
        # Fail early
        if pooling not in cls.FUNCS:
            raise ValueError(
                f"pooling should be in {cls.FUNCS}, is now {pooling}")

        if accepted_semtypes is None:
            accepted_semtypes = ALL_SEMTYPES

        q = QuickUMLS(path_to_quickumls,
                      accepted_semtypes=accepted_semtypes,
                      threshold=threshold,
                      similarity_name=similarity_name)
        # Load the spaCy model, disabling NER and the parser.
        q.nlp = spacy.load(spacy_string, disable=("ner", "parser"))
        return cls(q, pooling, priors, n_workers)
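
A short usage sketch for the loader above; the installation path is hypothetical, and prediction calls are omitted because they are not shown in this excerpt:

clf = QuickUMLSClassifier.load(
    '/data/quickumls_install/',   # hypothetical QuickUMLS path
    threshold=0.9,
    similarity_name='jaccard',
    pooling='mean',
    n_workers=1)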
Example #8
    def process_data(pid, doc_list):
        data = []
        matcher = QuickUMLS(args.quickumls_path,
                            overlapping_criteria='score',
                            threshold=0.6)
        for i, doc in enumerate(doc_list):
            qumls_res = matcher.match(doc['text'])

            res_list = ddict(list)
            for men in qumls_res:
                for cand in men:
                    start, end = cand['start'], cand['end']
                    umls_cui = cand['cui']
                    score = cand['similarity']
                    res_list[(start, end)].append((umls_cui, score))

            doc['result'] = dict(res_list)
            data.append(doc)

            if i % 10 == 0:
                print('Completed [{}] {}, {}'.format(
                    pid, i,
                    time.strftime("%d_%m_%Y") + '_' +
                    time.strftime("%H:%M:%S")))

        return data
Example #9
class QuickUMLSProcessor(MERToolProcessor):
    def __init__(self, config):
        self.__quickumls = QuickUMLS('/home/daniel/QuickUMLS')
        self.__matches = None
        super().__init__(config)

    def process_input(self):
        """Extracts information from input"""
        input_file = self._input_filepath.open(encoding='utf8')
        text = input_file.read()
        print('--- QuickUMLS: Processing input ---')
        start_time = time.time()
        self.__matches = self.__quickumls.match(text,
                                                best_match=True,
                                                ignore_syntax=False)
        end_time = time.time() - start_time
        print('--- {} seconds ---'.format(end_time))

    def format_output(self):
        """Formats the original output to eHealth-KD subtask A output"""
        umls_concepts = map(lambda match_list: match_list[0],
                            self.__matches)  # Only first term (preferred term)
        ordered_concepts = sorted(
            umls_concepts,
            key=lambda umls_concept: umls_concept['start'])  # Order by start
        # Converts an UMLS concept to a eHealth-KD keyphrase
        for concept in ordered_concepts:
            keyphrase = {'label': 'Concept', 'term': concept['ngram']}
            multiword_term = concept['ngram'].split()
            if len(multiword_term) == 1:  # single-word terms keep the simple start/end span
                keyphrase['span'] = '{0} {1}'.format(concept['start'],
                                                     concept['end'])
            else:
                span = []
                for token in multiword_term:
                    if not span:
                        span.append(
                            (concept['start'], concept['start'] + len(token)))
                    else:
                        span.append(
                            (span[-1][1] + 1, span[-1][1] + 1 + len(token)))
                span = map(lambda tup: '{0} {1}'.format(tup[0], tup[1]), span)
                keyphrase['span'] = ';'.join(span)
            self._key_phrases.append(keyphrase)
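
A worked example of the span arithmetic above, with hypothetical offsets:

# ngram = 'colon cancer', start = 10, end = 22
# 'colon'  -> (10, 15)
# 'cancer' -> (16, 22)   # previous end + 1, plus the token length
# keyphrase['span'] == '10 15;16 22'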
Example #10
class QUMLS(BaseLinker):
    def __init__(self, args):
        from quickumls import QuickUMLS

        assert args.quickumls_path is not None, "Please provide path where QuickUMLS is installed"
        assert args.num_worker == 1, "QuickUMLS doesn't support num_workers > 1"

        self.matcher = QuickUMLS(args.quickumls_path,
                                 overlapping_criteria='score',
                                 threshold=0.6)

    def __call__(self, text):
        qumls_res = self.matcher.match(text)
        men_list = ddict(list)
        for men in qumls_res:
            for cand in men:
                start, end = cand['start'], cand['end']
                umls_cui = cand['cui']
                score = cand['similarity']
                men_list[(start, end)].append([umls_cui, round(score, 3)])

        return self.reformat(men_list, text)
Example #11
start = time.time()

# 'b', 'o', and 's' are parsed from the command line earlier in the
# original script (not shown in this excerpt).
best_match = (b == 'true')

# directory of notes to process
directory_to_parse = '/data/data_in/'

# QuickUMLS data directory
quickumls_fp = '/data/UMLS/'
os.chdir(directory_to_parse)

matcher = QuickUMLS(quickumls_fp, overlapping_criteria=o, threshold=0.7,
                    window=5, similarity_name=s)
test = pd.DataFrame()
# Local aliases for functions used repeatedly further down in the script.
fn = pd.concat
gn = matcher.match
df = pd.DataFrame

for fname in glob.glob(directory_to_parse + '*.txt'):
    u = os.path.basename(fname).split('.')[0]
    with open(fname) as f:
        f1 = f.read()

        print(u)
        out = gn(f1, best_match=best_match, ignore_syntax=False)

        for i in out:
Example #12
def main(args):
    print('=============')
    if args.granularity not in ['N', 'S', 'W']:
        raise TypeError(
            'Invalid value for the granularity - should be N, S, or W')

    print('Reading MIMIC-III data...')
    if args.skiplims is None:
        notes_df = read_csv(args.noteevents_fp)
    else:
        to_skip = []
        for i in range(0, len(args.skiplims), 2):
            to_skip += [
                j for j in range(args.skiplims[i], args.skiplims[i + 1])
            ]
        notes_df = read_csv(args.noteevents_fp, skiprows=to_skip)

    print('Preprocessing notes ...')
    parsed_list = []
    for note in tqdm(notes_df['TEXT']):
        note = note.lower()
        note = re.sub('[^a-zA-Z.]', ' ', note)
        note = re.sub(r'\s+', ' ', note)

        # For finer granularity than entire notes, tokenize so that we can
        # iterate over sentences or words
        if args.granularity != 'N':
            note = nltk.sent_tokenize(note)
            if args.granularity == 'W':
                for i in range(len(note)):
                    note[i] = re.sub('[.]', '', note[i])
                note = [nltk.word_tokenize(sentence) for sentence in note]
                for i in range(len(note)):
                    note[i] = [
                        word for word in note[i]
                        if word not in stopwords.words('english')
                    ]

        parsed_list.append(note)

    print('Matching with UMLS corpus...')
    # initialise QuickUMLS string matching object
    matcher = QuickUMLS(args.qumls_fp,
                        threshold=args.thresh,
                        similarity_name=args.sim)

    # useful to define these two here so the mapping loop isn't too verbose
    qumls_getter = lambda n: matcher.match(
        n, best_match=False, ignore_syntax=False)
    # this gets the maximum similarity score and its index in the list for that ngram
    simscore_getter = lambda l: max(enumerate([d['similarity'] for d in l]),
                                    key=itemgetter(1))

    ALL = args.attr == 'all'

    if ALL:
        # make a dictionary which will have the columns to be added to the dataframe
        names = ['term', 'cui', 'semtypes']
        attrs = {}
        for name in names:
            attrs[name] = []
    else:
        mapped_corpus = []
    if args.keep_similarity: similarity_scores = []

    for note in tqdm(parsed_list):
        if ALL:
            # note-level mini-version of the dictionary "attrs" to collect the attributes for each note
            sub_attr = {}
            for name in names:
                sub_attr[name] = []
        else:
            single_attr_list = []
        if args.keep_similarity: sim_list = []
        if args.granularity == 'N':
            res = qumls_getter(note)
            for l in res:
                ss = simscore_getter(l)
                if ALL:
                    for name in names:
                        sub_attr[name].append(l[ss[0]][name])
                else:
                    single_attr_list.append(l[ss[0]][args.attr])
                if args.keep_similarity: sim_list.append(ss[1])
        else:
            for s in note:
                if args.granularity != 'W':
                    res = qumls_getter(s)
                    for l in res:
                        ss = simscore_getter(l)
                        if ALL:
                            for name in names:
                                sub_attr[name].append(l[ss[0]][name])
                        else:
                            single_attr_list.append(l[ss[0]][args.attr])
                        if args.keep_similarity: sim_list.append(ss[1])
                else:
                    for w in s:
                        res = qumls_getter(w)[0]
                        ss = simscore_getter(res)
                        if ALL:
                            for name in names:
                                sub_attr[name].append(res[ss[0]][name])
                        else:
                            single_attr_list.append(res[ss[0]][args.attr])
                        if args.keep_similarity: sim_list.append(ss[1])
        if ALL:
            if args.filter_semtypes_file is not None:
                irrelevant_type_ids = [
                    line.strip() for line in open(args.filter_semtypes_file, 'r')
                ]
                # enumerate instead of list.index(), so duplicate semtype sets
                # don't all resolve to the first occurrence
                indices_to_remove = [
                    idx for idx, st_set in enumerate(sub_attr['semtypes'])
                    if all(st in irrelevant_type_ids for st in st_set)
                ]
                for name in names:
                    sub_attr[name] = [
                        val for idx, val in enumerate(sub_attr[name])
                        if idx not in indices_to_remove
                    ]
            for name in names:
                mapped_note = ''
                for a in sub_attr[name]:
                    if name == 'semtypes':
                        for a_ in a:
                            mapped_note += a_ + ' '
                    else:
                        mapped_note += a + ' '
                attrs[name].append(mapped_note)
        else:
            mapped_note = ''.join(word + ' ' for word in single_attr_list)
            mapped_corpus.append(mapped_note)
        if args.keep_similarity:
            # Collect each note's scores; assigning sim_list directly to the
            # dataframe at the end would keep only the last note's scores.
            similarity_scores.append(sim_list)

    print('Matching finished!')

    print('Writing .csv file...')
    if ALL:
        for name, mapped in attrs.items():
            notes_df[name.upper()] = mapped
    else:
        notes_df[args.attr.upper()] = mapped_corpus
    if args.keep_similarity: notes_df['SIM_SCORE'] = similarity_scores

    if args.outfilepath[-4:] != '.csv': args.outfilepath += '.csv'
    notes_df.to_csv(args.outfilepath, index=False)

    print('Done!')
    print('=============')
Example #13
def run(snippets, nlp):
    resource_path = configure.RESOURCE_PATH
    sem_file = os.path.join(configure.RESOURCE_PATH, 'SemGroups.txt')
    quickUMLS_file = configure.QUICKUMLS_FILE

    # retrieve the predefined treatment semantic types
    drug_types, procedure_types, activity_types, device_types = \
        configure.quickUMLS_config()

    # get the exclude_terms
    exclude_terms = get_exclude_terms(
        os.path.join(resource_path, 'attribute_patterns.txt'),
        os.path.join(resource_path, 'relation_patterns.txt'),
        os.path.join(resource_path, 'exclude_terms.txt'))

    # get sem_map, which is the association of the semantic types and semantic groups
    sem_map = get_semtype_map(sem_file)

    # initial extraction
    print('*' * 25 + 'initial extraction' + '*' * 25)
    matcher = QuickUMLS(quickUMLS_file,
                        overlapping_criteria='score',
                        threshold=0.8,
                        accepted_semtypes=','.join([
                            drug_types, procedure_types, activity_types,
                            device_types
                        ]))

    for snippet in snippets:
        snippet['entities'] = extract_entities(snippet['processed'], matcher,
                                               exclude_terms, sem_map)
        convert_snippet(snippet)

    # remapping: expand the boundary of initially extracted treatment entities
    print('*' * 25 + 'remapping' + '*' * 25)
    file = configure.QUICKUMLS_FILE
    # the overlapping criterion is changed to 'length' before remapping
    matcher = QuickUMLS(file,
                        overlapping_criteria='length',
                        threshold=0.8,
                        accepted_semtypes=','.join([
                            drug_types, procedure_types, activity_types,
                            device_types
                        ]))

    remapping_exclude_terms = get_exclude_terms(
        os.path.join(resource_path, 'attribute_patterns.txt'),
        os.path.join(resource_path, 'relation_patterns.txt'),
        os.path.join(resource_path, 'remapping_exclude_terms.txt'))
    for snippet in snippets:
        print('processing:\t' + snippet['processed'])
        if len(snippet['entities']) == 0:
            continue
        print('before expanding:')
        for entity in snippet['entities']:
            print(entity['ngram'])

        new_entities = remapping(
            snippet['entities'],
            expand_boundary(snippet['representation'], nlp,
                            remapping_exclude_terms),
            snippet['representation'], snippet['processed'], matcher, sem_map,
            exclude_terms)
        new_entities = sorted(new_entities, key=lambda x: x['start'])
        snippet['entities'] = new_entities
        print('after expanding:')
        for entity in snippet['entities']:
            print(entity['ngram'])

    # convert semtype set to list (for json)
    for snippet in snippets:
        if 'entities' in snippet.keys():
            for entity in snippet['entities']:
                entity['semtypes'] = list(entity['semtypes'])

    # convert to representation
    for snippet in snippets:
        convert_snippet(snippet)
Example #14
if os.environ.get("deployment", False):
    app.config.from_pyfile('/etc/cs4300-volume-cfg/cs4300app.cfg')
else:
    app.config.from_pyfile(os.path.join(
        os.path.join(os.getcwd(), "secrets"), "cs4300app.cfg"))

gunicorn_logger = logging.getLogger('gunicorn.error')
app.logger.handlers = gunicorn_logger.handlers
app.logger.setLevel(gunicorn_logger.level)


os.system("cp -r concept_matching/quickUCSLS concept_matching/quickUCSLS_{}".format(os.getpid()))
app.logger.debug("PID: {}".format(os.getpid()))

concept_matcher = QuickUCSLS(
    "./concept_matching/quickUCSLS_{}".format(os.getpid()),
    accepted_semtypes={"T{:03d}".format(i) for i in range(1, 35)},
    threshold=0.5,
    min_match_length=0)
app.logger.debug("Matcher res: {}".format(concept_matcher.match("cos sim")))
app.logger.debug("Matcher Ready")

def get_preferred_terms():
    preferred_term = dict()
    with codecs.open("./concept_matching/definition_files/MRCONSO.RRF") as f:
        for i, ln in enumerate(f):
            if i < 1:
                continue
            # Assumes a reduced 4-column export; the full UMLS MRCONSO.RRF
            # has 18 pipe-delimited fields.
            cui, s, _, pref = ln.strip().split("|")
            if pref == "Y":
                preferred_term[cui] = s
    return preferred_term

preferred_term = get_preferred_terms()
Example #15
import os
import re

from pymetamap import MetaMap
from quickumls import QuickUMLS
"""This is a script that inputs local text files and outputs Patient IDs and UMLS terms.

This script splits text files within the working directory into lines, removes any lines with commonly 
negated terms and then extracts UMLS information from the remaining positive lines. This script outputs Patient IDs, 
given that the name of the file is the Patient ID, and UMLS information for non-negated text lines. The default 
is to also output each line that has positive terms. This can easily be commented out if desired. 
This script is meant for use with UCSF clinical notes. Please see the README file for information on the UMLS
metathesaurus and QuickUMLS installations.
"""

## If running more than once, comment this line out; defining 'matcher' a
## second time will raise an error.
## The path should be the destination_path created during the QuickUMLS
## installation. Change accordingly.
matcher = QuickUMLS(
    '/Users/madisonmyers/Desktop/QuickUMLS-master/destination_path')

location = os.getcwd()  ## Uses the current working directory; make sure notes/text files are in this folder.

for file in os.listdir(location):
    if file.endswith(".txt"):
        ## many of the UCSF clinical notes need utf-8 encoding else it will result in an error
        open_file = open(file, 'r', encoding='utf-8', errors='ignore')
        doclist = [line for line in open_file]
        docstr = ''.join(doclist)
        bn_sents = re.split(r'[.!?]', docstr)
        ## 'result' is undefined in this excerpt, and matcher.match expects a
        ## single string, so match sentence by sentence:
        out = [matcher.match(sent, best_match=True, ignore_syntax=False)
               for sent in bn_sents]
        filename = file.split(".")[0].split("/")[-1]
        ## most common negated terms in clinical text
        f = [
Example #16
# Environment variables are strings, so cast them to the expected types.
# (quickumls_fp and overlapping_criteria are defined earlier in the original
# script, outside this excerpt.)
threshold = float(os.environ.get('THRESHOLD', 0.7))
similarity_name = os.environ.get('SIMILARITY_NAME', "jaccard")
window = int(os.environ.get('WINDOW', 5))
min_match_length = int(os.environ.get('MIN_MATCH_LENGTH', 3))
verbose = os.environ.get('VERBOSE', 'false').lower() == 'true'

accepted_semtypes = os.environ.get('ACCEPTED_SEMTYPES',
                                   constants.ACCEPTED_SEMTYPES)

print(
    "quickumls_fp={}, overlapping_criteria={}, threshold={}, similarity_name={}, window={}, accepted_semtypes={}"
    .format(quickumls_fp, overlapping_criteria, threshold, similarity_name,
            window, accepted_semtypes))

matcher = QuickUMLS(quickumls_fp, overlapping_criteria, threshold, window,
                    similarity_name, min_match_length, accepted_semtypes,
                    verbose)


class SetEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)
        return json.JSONEncoder.default(self, obj)


def process(data):
    dto = json.loads(str(data))
    text = dto['text']
    matches = matcher.match(text, best_match=True, ignore_syntax=True)
    return json.dumps(matches, cls=SetEncoder)
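
A hedged round-trip sketch for the process() helper above; the input text is illustrative:

payload = json.dumps({'text': 'Patient reports chest pain radiating to the left arm.'})
response = process(payload)   # JSON string; semtype sets become lists via SetEncoder
print(json.loads(response))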
Example #17
        type=float,
        default=0.9,
        help='Select a threshold (between 0 and 1) - default: 0.9')

    ARGStemp = parser.parse_args()
    return ARGStemp


if __name__ == "__main__":
    # Parse arguments
    global ARGS
    ARGS = parse_arguments()

    # Start the process
    global matcher
    matcher = QuickUMLS(quickumls_fp='./QuickUMLS',
                        overlapping_criteria='score',
                        threshold=ARGS.t,
                        similarity_name='cosine',
                        window=5)
    print("QuickUMLS Threshold: ", ARGS.t)
    global TUIs
    if ARGS.TUI == "Alpha" or ARGS.TUI == "alpha":
        TUIs = TUI_alpha
        print("TUI list Alpha selected")
    else:
        TUIs = TUI_beta
        print("TUI list Beta selected")
    pool = Pool(max(1, os.cpu_count() - 4))  # leave a few cores free; guard against machines with <= 4 cores
    pool.map(main_funct, os.listdir(dirchunks))
Example #18
    Return pkl files of the dataframes with two added columns containing the CUIs identified by QuickUMLS (directory is specified by the '--path_out' parameter).
    """

    ############## PARAMETERS ##############
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--path_in', default='../data/bern_df/')
    argparser.add_argument('--path_out', default='../data/cuis/')
    args = argparser.parse_args()

    ############## INSTANTIATE QuickUMLS ##############
    sem_diseases = ['T020', 'T190', 'T049', 'T019', 'T047', 'T050', 'T033', 'T037', 'T048', 'T191', 'T046', 'T184']
    sem_drugs = ['T116', 'T195', 'T123', 'T122', 'T103', 'T120', 'T104', 'T200', 'T196', 'T126', 'T131', 'T125', 'T129', 'T130', 'T197', 'T114', 'T109', 'T121', 'T192', 'T127']
    sem_dis_drug = sem_diseases + sem_drugs
    data_dir = '../data/quickUMLS_eng'

    matcher = QuickUMLS(quickumls_fp=data_dir, accepted_semtypes=sem_dis_drug)

    ############## PROCESS ##############
    path_in = Path(args.path_in)
    path_out = Path(args.path_out)
    path_out.mkdir(exist_ok=True, parents=True)

    for file in path_in.glob('*.pkl'):
        batch = process_batch(file, 'idx')
        batch_name = file.stem
        batch['disease_cuis'] = batch['ent_text_disease'].apply(apply_QuickUMLS, args=(matcher,))
        batch['drug_cuis'] = batch['ent_text_drug'].apply(apply_QuickUMLS, args=(matcher,))
        batch.to_pickle(f"{path_out}/{batch_name}.pkl")
        print(f"{batch_name} is processed and saved.")
Example #20
    return ann_by_text_id


def format_annotation(ann):
    outann = {
        "start": ann["start"],
        "end": ann["end"],
        "matched_text": ann["ngram"],
        "semtypes": list(ann["semtypes"]),
        "cui": ann["cui"]
    }
    return outann


if __name__ == "__main__":
    args = parse_args()
    if os.path.exists(args.outfile):
        print("Output file already exists. Please delete it and rerun.")
        print("Aborting.")
        sys.exit(1)
    docs = json.load(open(args.documents, 'r'))
    conf = json.load(open(args.quickumls_conf))
    print("========================")
    print("QuickUMLS configuration:")
    print("QuickUMLS installation: {args.quickumls_install_dir}")
    print(json.dumps(conf, indent=2))
    print("========================")
    matcher = QuickUMLS(args.quickumls_install_dir, **conf)
    anns = run_quickumls(docs, matcher)
    json.dump(anns, open(args.outfile, 'w'))
Example #21
    return list(set(similarWords))


pattern = "(?:^(?:never|no|nothing|nowhere|noone|none|not|havent|hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint)$)|n't"
endSymbolsTillNegation = [',', '.', ':', ';', '!', '?']

stopWords = stopwords.words('english')
print(stopWords)

removeSymbolsList = [
    '∆', '(', ')', ',', '.', 'β', 'α', "'s'", '$', '``', "''", "'s", ':', ';',
    '/', '\\', '+'
]

matcher = QuickUMLS(
    '/home/roysoumya/Documents/ClinicalTrials_Coding/QuickUMLS/QuickUMLS_data/'
)

lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

input_query_path = "/mnt/c/Users/roysoumya/Documents/ClinicalTrials_Coding/COCTR_multidimensional_ranking-master/datasetPreparation/src/ExtendedRetrievalCodes/data/extended_retr_pagerank/"
output_path = "/mnt/c/Users/roysoumya/Documents/ClinicalTrials_Coding/COCTR_multidimensional_ranking-master/datasetPreparation/src/ExtendedRetrievalCodes/data/appendSynsetMatch_10thSept/"

files = sorted(listdir(input_query_path))

# In[12]:


def getListOfWordsForWhichUMLSConceptdidntGen(query):
    final_query = query
Example #22
print('Creating QuickUMLS object...')

quickumls_path = r'C:\quickumls\SNOMED_RXNORM_CPT_lowercase'

total_iterations = 1
ignore_syntax = False
threshold = 0.7
accepted_semtypes = None
# accepted_semtypes = constants.ACCEPTED_SEMTYPES

print('Setting up for semtypes (None means all types) : {}'.format(
    accepted_semtypes))

matcher = QuickUMLS(quickumls_path,
                    accepted_semtypes=accepted_semtypes,
                    threshold=threshold)

print('QuickUMLS object created...')

text_file_path = r'data/colonoscopy-1.txt'
with open(text_file_path, 'r') as file:
    text = file.read()

print('Length of Text : {0} characters'.format(len(text)))

print('About to reprocess this text [{0}] times'.format(total_iterations))

results_list = []
result_count = 0
Example #23
class QuickUMLSDriver(EntityLinker):
    def __init__(self,
                 name="quickumls",
                 quickumls_install="",
                 criterion="score",
                 min_score=0.7,
                 keep_semtypes=None):
        """
        Interface to QuickUMLS.

        :param str quickumls_install: The path to the QuickUMLS installation.
        :param float min_score: Minimum score to consider, between 0 and 1.0.
        :param list keep_semtypes: List of semantic types to consider.
        """
        super().__init__(name)
        self.quickumls_install = quickumls_install
        self.criterion = criterion
        self.min_score = min_score
        self.keep_semtypes = keep_semtypes
        self._log_parameters()
        self._start()

    def _log_parameters(self):
        self._log(f"Staring annotator '{self.name}'")
        self._log(f"{self.name} parameters:")
        self._log(f"  quickumls_install : {self.quickumls_install}")
        self._log(f"  criterion : {self.criterion}")
        self._log(f"  min_score : {self.min_score}")
        self._log(f"  keep_semtypes : {self.keep_semtypes}")

    def _start(self):
        """
        Instantiate the QuickUMLS matcher.
        """
        self._linker = QuickUMLS(self.quickumls_install,
                                 overlapping_criteria=self.criterion,
                                 threshold=self.min_score,
                                 accepted_semtypes=self.keep_semtypes)
        self._log("Started")

    def _convert_output_to_candidate_links(self, outputs):
        """
        Convert the raw QuickUMLS output into CandidateLink
        instances. Output is of the format:

        {matched_string: [CandidateLink, [...]]}

        :param list outputs: List of outputs from QuickUMLS.match().
        """
        links = defaultdict(list)
        for phrase in outputs:
            seen_cuis = set()
            for match in phrase:
                try:
                    candidate_term = match["preferred_term"]
                    if candidate_term == "":  # No preferred_term found.
                        candidate_term = match["term"]
                except KeyError:
                    candidate_term = match["term"]
                # QuickUMLS sometimes returns the same CUI more than once.
                if match["cui"] in seen_cuis:
                    continue
                else:
                    seen_cuis.add(match["cui"])
                candidate = CandidateLink(
                    input_string=match["ngram"],
                    candidate_term=candidate_term,
                    candidate_source="UMLS",
                    candidate_id=match["cui"],
                    linking_score=match["similarity"],
                    # attrs
                    umls_semantic_type=match["semtypes"])
                links[match["ngram"]].append(candidate)
        return links

    def link(self, queries):
        """
        Link a query or list of queries to entities in the
        corresponding database. Input should be a
        sequence of (ID, text) pairs. Outputs a nested
        dictionary of the format

            {input_id: {matched_input: [CandidateLink, [...]]}}.

        :param list queries: List of (ID, string) pairs to link.
        :returns: Dictionary of input strings to CandidateLink instances.
        :rtype: dict
        """
        queries = self._prepare_queries(queries, ascii_only=False)
        all_links = {}
        for (qid, query) in queries:
            output = self._linker.match(query)
            links = self._convert_output_to_candidate_links(output)
            all_links[qid] = links
        return all_links

    def get_best_links(self, candidate_links, keep_top_n):
        """
        Given a set of candidate links for a set of input
        strings returned by EntityLinker.link(), choose
        the N "best" linkings for each input string from among
        the candidate links.

            {input_id: {matched_input: [CandidateLink, [...]]}}

        :param dict candidate_links: Dictionary of input strings
                                     to candidate linkings.
        :param int keep_top_n: Number of top-scoring links to keep per input.
        :returns: candidate_links filtered to include only the N "best" links.
        :rtype: dict
        """
        for qid in candidate_links.keys():
            for (matched_str, candidates) in candidate_links[qid].items():
                for c in candidates:
                    if matched_str.lower() == c.candidate_term.lower():
                        c.linking_score = 1.0
                candidates_sorted = sorted(candidates,
                                           key=lambda x: x.linking_score,
                                           reverse=True)
                candidates_top_n = candidates_sorted[:keep_top_n]
                candidate_links[qid][matched_str] = candidates_top_n
        return candidate_links
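
A hedged usage sketch for the driver above; the installation path and query are hypothetical:

driver = QuickUMLSDriver(quickumls_install='/data/quickumls_install/',
                         criterion='score',
                         min_score=0.7)
links = driver.link([('doc1', 'history of myocardial infarction')])
best = driver.get_best_links(links, keep_top_n=1)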
Example #24
try:
    from quickumls import QuickUMLS
except ImportError:
    from .quickumls import QuickUMLS

print('Creating QuickUMLS object...')

quickumls_path = r'C:\quickumls'

matcher = QuickUMLS(quickumls_path)

print('QuickUMLS object created...')

text = "The ulna has dislocated posteriorly from the trochlea of the humerus."

print('*************************')
print('Text:')
print(text)
print('*************************')

res = matcher.match(text, best_match=True, ignore_syntax=False)

print('Matching results:')
print(res)
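
For reference, matcher.match() returns a list of candidate lists; each candidate is a dict with the keys used throughout these examples. A representative sketch (values illustrative, not real output):

# [[{'start': 4, 'end': 8, 'ngram': 'ulna', 'term': 'ulna',
#    'cui': 'C0000000', 'similarity': 1.0, 'semtypes': {'T023'}}, ...], ...]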
Example #25
import pandas as pd
from collections import defaultdict
from quickumls import QuickUMLS

to_annot_data = pd.read_csv('toAnnotateWithText_9thSept.csv')
matcher = QuickUMLS(
    '/home/roysoumya/Documents/ClinicalTrials_Coding/QuickUMLS/QuickUMLS_data/'
)

brief_title_concepts_list = list()
brief_summ_concepts_list = list()

for row_id in range(to_annot_data.shape[0]):
    brief_title = to_annot_data.iloc[row_id, 2]
    brief_summ = to_annot_data.iloc[row_id, 3]

    brief_title_umls = matcher.match(brief_title,
                                     best_match=True,
                                     ignore_syntax=False)
    brief_title_concepts_list.append(';'.join(
        [elem[0][u'cui'] for elem in brief_title_umls]))

    brief_summ_umls = matcher.match(brief_summ,
                                    best_match=True,
                                    ignore_syntax=False)
    brief_summ_concepts_list.append(';'.join(
        [elem[0][u'cui'] for elem in brief_summ_umls]))

    if row_id % 50 == 0:
        print(row_id)
Example #26
import os
import re

import numpy as np
from quickumls import QuickUMLS

QUICKUMLS_FP = os.path.expanduser('~/quickumls_data/')
matcher = QuickUMLS(QUICKUMLS_FP,
                    threshold=0.7,
                    similarity_name='jaccard',
                    window=5)


def parse_chunker(original_text, phrase_matches):
    order = np.argsort([match[0]['start'] for match in phrase_matches])

    offset = 0
    chunked_string = original_text
    prev_end = 0
    for num_match, match_idx in enumerate(order):
        match = phrase_matches[match_idx]
        ngram = match[0]['ngram']
        term = match[0]['term']
        start = match[0]['start']
        end = match[0]['end']
        assert start >= prev_end
        prev_end = end
        # Only change multi word phrases
        if len(ngram.split()) == 1:
            continue
        term = '_'.join(term.split())
Example #27
from quickumls import QuickUMLS
import csv, os, sys, time, re

if __name__ == "__main__":
    # start_time = time.time()

    THRESHOLD = 0.7
    matcher = QuickUMLS(quickumls_fp='./QuickUMLS',
                        overlapping_criteria='score',
                        threshold=THRESHOLD,
                        similarity_name='cosine',
                        window=5)
    # myDict = {}
    dirchunks = "./data/chunkssmall/"
    diroutputchunks = "./data/outputchunkssmall/"
    # list_cui = []
    # list_terms = []
    for file in os.listdir(dirchunks):
        filename = dirchunks + file
        # liste_concepts = []
        lineNb = 1
        list_cui = []
        list_terms = []
        with open(filename, 'r') as fd:
            print("File",
                  filename,
                  "opened! \nNow treating line: ",
                  flush=True)
            # Preparing outputfile
            outputFile = diroutputchunks + file + ".output"
            fw = open(outputFile, 'w')