Exemplo n.º 1
0
def processOneFile_fda(fileName, annotation_dir, nlp_tool, isTraining, types,
                       type_filter, isFDA2018, isNorm):
    """Load one FDA annotation file and build one Document per section.

    Args:
        fileName: Name of the annotation file inside ``annotation_dir``. The
            part before the first '.' is used as the document-name prefix.
        annotation_dir: Directory that contains ``fileName``.
        nlp_tool: Tokenizer object forwarded to
            ``get_sentences_and_tokens_from_nltk``.
        isTraining: When truthy, the document's entities are handed to the
            tokenizer; otherwise ``None`` is passed instead.
        types: When truthy, mentions whose type is not in ``type_filter``
            are skipped.
        type_filter: Container of accepted mention types.
        isFDA2018: See ``isNorm``.
        isNorm: When ``isFDA2018 == False`` and ``isNorm == True`` the
            entities are built from ``annotation_file.reactions``; in every
            other case they are the mentions belonging to the section.

    Returns:
        A tuple ``(documents, annotation_file)`` where ``documents`` holds
        one Document per section, in section order.

    Raises:
        RuntimeError: If a section with text is processed while the
            module-level ``opt.nlp_tool`` is not ``"nltk"``.
    """
    documents = []
    annotation_file = get_fda_file(join(annotation_dir, fileName))

    # partition() keeps the whole name when it contains no '.'; slicing with
    # find() would use -1 there and silently drop the last character.
    name_prefix = fileName.partition('.')[0]

    # Explicit comparisons are kept on purpose so that non-bool flag values
    # select the same branch as before.
    use_reactions = isFDA2018 == False and isNorm == True  # noqa: E712

    # each section is a document
    for section in annotation_file.sections:
        document = Document()
        document.name = name_prefix + "_" + section.id

        if section.text is None:
            # Sections without text still yield an (empty) document.
            document.text = ""
            document.entities = []
            document.sentences = []
            documents.append(document)
            continue

        document.text = section.text

        entities = []
        if use_reactions:
            # NOTE(review): reactions are not filtered by section, so every
            # section receives an entity for each reaction in the file.
            for reaction in annotation_file.reactions:
                entity = Entity()
                entity.name = reaction.name
                for normalization in reaction.normalizations:
                    # meddra_pt_id can be None
                    entity.norm_ids.append(normalization.meddra_pt_id)
                    entity.norm_names.append(normalization.meddra_pt)
                entities.append(entity)
        else:
            for mention in annotation_file.mentions:
                if mention.section != section.id:
                    continue
                if types and (mention.type not in type_filter):
                    continue
                entities.append(mention)

        document.entities = entities

        # The tool is selected from the global options, not from the
        # ``nlp_tool`` argument (which is the tokenizer object itself).
        if opt.nlp_tool != "nltk":
            raise RuntimeError("invalid nlp tool")

        # Gold entities are only given to the tokenizer when training.
        gold_entities = document.entities if isTraining else None
        document.sentences = get_sentences_and_tokens_from_nltk(
            section.text, nlp_tool, gold_entities,
            annotation_file.ignore_regions, section.id)

        documents.append(document)

    return documents, annotation_file
Exemplo n.º 2
0
def processOneFile_fda(fileName, annotation_dir, types, type_filter, isFDA2018,
                       isNorm):
    """Load one FDA annotation file and build one Document per section.

    This variant does no tokenization: every document's ``sentences`` is an
    empty list.

    Args:
        fileName: Name of the annotation file inside ``annotation_dir``. The
            part before the first '.' is used as the document-name prefix.
        annotation_dir: Directory that contains ``fileName``.
        types: When truthy, mentions whose type is not in ``type_filter``
            are skipped.
        type_filter: Container of accepted mention types.
        isFDA2018: See ``isNorm``.
        isNorm: When ``isFDA2018 == False`` and ``isNorm == True`` the
            entities are built from ``annotation_file.reactions``; in every
            other case they are the mentions belonging to the section.

    Returns:
        A tuple ``(documents, annotation_file)`` where ``documents`` holds
        one Document per section, in section order.
    """
    documents = []
    annotation_file = get_fda_file(os.path.join(annotation_dir, fileName))

    # partition() keeps the whole name when it contains no '.'; slicing with
    # find() would use -1 there and silently drop the last character.
    name_prefix = fileName.partition('.')[0]

    # Explicit comparisons are kept on purpose so that non-bool flag values
    # select the same branch as before.
    use_reactions = isFDA2018 == False and isNorm == True  # noqa: E712

    # each section is a document
    for section in annotation_file.sections:
        document = Document()
        document.name = name_prefix + "_" + section.id

        if section.text is None:
            # Sections without text still yield an (empty) document.
            document.text = ""
            document.entities = []
            document.sentences = []
            documents.append(document)
            continue

        document.text = section.text

        entities = []
        if use_reactions:
            # NOTE(review): reactions are not filtered by section, so every
            # section receives an entity for each reaction in the file.
            for reaction in annotation_file.reactions:
                entity = Entity()
                entity.name = reaction.name
                for normalization in reaction.normalizations:
                    # meddra_pt_id can be None
                    entity.norm_ids.append(normalization.meddra_pt_id)
                    entity.norm_names.append(normalization.meddra_pt)
                entities.append(entity)
        else:
            for mention in annotation_file.mentions:
                if mention.section != section.id:
                    continue
                if types and (mention.type not in type_filter):
                    continue
                entities.append(mention)

        document.entities = entities
        document.sentences = []

        documents.append(document)

    return documents, annotation_file
Exemplo n.º 3
0
def parse_one_gold_file(annotation_dir, corpus_dir, fileName):
    """Build a Document from one gold BioC annotation file and its text file.

    Only annotations of the first passage of the first BioC document are
    read. An annotation is kept when its type is in the module-level
    ``type_we_care``, its text is non-empty, and it carries either a SNOMED
    or a MedDRA code/term pair (SNOMED is preferred when both are present).

    Args:
        annotation_dir: Directory that contains the BioC file ``fileName``.
        corpus_dir: Directory that contains the raw text file, whose name is
            ``fileName`` up to the first ``'.bioc'``.
        fileName: Name of the BioC annotation file. The part before the
            first '.' becomes the document name.

    Returns:
        The Document with ``name``, ``entities`` and ``text`` filled in.
    """
    document = Document()
    # partition() keeps the whole name when it contains no '.'; slicing with
    # find() would use -1 there and silently drop the last character.
    document.name = fileName.partition('.')[0]

    annotation_file = get_bioc_file(os.path.join(annotation_dir, fileName))
    bioc_passage = annotation_file[0].passages[0]

    # Vocabularies in order of preference: (code key, term key).
    norm_keys = (('SNOMED code', 'SNOMED term'),
                 ('MedDRA code', 'MedDRA term'))

    entities = []
    for annotation in bioc_passage.annotations:
        infons = annotation.infons
        if infons['type'] not in type_we_care:
            continue

        entity_ = Entity()
        entity_.id = annotation.id
        # Replaces the two-character sequence backslash + 'n' with a space.
        processed_name = annotation.text.replace('\\n', ' ')
        if len(processed_name) == 0:
            logging.debug("{}: entity {} name is empty".format(
                fileName, annotation.id))
            continue
        entity_.name = processed_name
        entity_.type = infons['type']
        # Only the first location of the annotation is used.
        entity_.spans.append(
            [annotation.locations[0].offset, annotation.locations[0].end])

        # A pair counts only if both keys exist and neither value is 'N/A'.
        norm = None
        for code_key, term_key in norm_keys:
            if all(key in infons and infons[key] != 'N/A'
                   for key in (code_key, term_key)):
                norm = (infons[code_key], infons[term_key])
                break

        if norm is None:
            logging.debug("{}: no norm id in entity {}".format(
                fileName, annotation.id))
            # some entities may have no norm id
            continue

        entity_.norm_ids.append(norm[0])
        entity_.norm_names.append(norm[1])
        entities.append(entity_)

    document.entities = entities

    corpus_file = get_text_file(
        os.path.join(corpus_dir, fileName.split('.bioc')[0]))
    document.text = corpus_file

    return document
Exemplo n.º 4
0
def processOneFile(fileName, annotation_dir, corpus_dir, nlp_tool, isTraining,
                   types, type_filter):
    """Build a tokenized Document from a BioC annotation file and its text.

    Args:
        fileName: Name of the BioC annotation file. The part before the
            first '.' becomes the document name; the part before the first
            ``'.bioc'`` is the text file name inside ``corpus_dir``.
        annotation_dir: Directory with the annotation file. When falsy, no
            annotations are read.
        corpus_dir: Directory with the raw text file.
        nlp_tool: Tokenizer object forwarded to the sentence splitter.
        isTraining: When truthy, the document's entities are handed to the
            sentence splitter; otherwise ``None`` is passed instead.
        types: When truthy, annotations whose type is not in
            ``type_filter`` are skipped.
        type_filter: Container of accepted annotation types.

    Returns:
        A tuple ``(document, ct_snomed, ct_meddra, ct_unnormed)``: the
        Document plus the number of entities normalized to SNOMED, to
        MedDRA, and the number skipped for lack of a normalization.

    Raises:
        RuntimeError: If the module-level ``opt.nlp_tool`` is not one of
            ``"spacy"``, ``"nltk"`` or ``"stanford"``.
    """

    def has_norm(infons, code_key, term_key):
        # Both keys must exist and neither value may be the 'N/A' marker.
        return all(key in infons and infons[key] != 'N/A'
                   for key in (code_key, term_key))

    document = Document()
    # partition() keeps the whole name when it contains no '.'; slicing with
    # find() would use -1 there and silently drop the last character.
    document.name = fileName.partition('.')[0]

    ct_snomed = 0
    ct_meddra = 0
    ct_unnormed = 0

    if annotation_dir:
        annotation_file = get_bioc_file(join(annotation_dir, fileName))
        # Only the first passage of the first BioC document is read.
        bioc_passage = annotation_file[0].passages[0]
        entities = []

        for annotation in bioc_passage.annotations:
            infons = annotation.infons
            if types and (infons['type'] not in type_filter):
                continue
            entity_ = Entity()
            entity_.id = annotation.id
            # Replaces the two-character sequence backslash + 'n'.
            processed_name = annotation.text.replace('\\n', ' ')
            if len(processed_name) == 0:
                logging.debug("{}: entity {} name is empty".format(
                    fileName, annotation.id))
                continue
            entity_.name = processed_name

            entity_.type = infons['type']
            # Only the first location of the annotation is used.
            entity_.spans.append(
                [annotation.locations[0].offset,
                 annotation.locations[0].end])

            # SNOMED takes precedence over MedDRA when both are present.
            if has_norm(infons, 'SNOMED code', 'SNOMED term'):
                entity_.norm_ids.append(infons['SNOMED code'])
                entity_.norm_names.append(infons['SNOMED term'])
                ct_snomed += 1
            elif has_norm(infons, 'MedDRA code', 'MedDRA term'):
                entity_.norm_ids.append(infons['MedDRA code'])
                entity_.norm_names.append(infons['MedDRA term'])
                ct_meddra += 1
            else:
                logging.debug("{}: no norm id in entity {}".format(
                    fileName, annotation.id))
                ct_unnormed += 1
                continue

            entities.append(entity_)

        document.entities = entities

    corpus_file = get_text_file(join(corpus_dir, fileName.split('.bioc')[0]))
    document.text = corpus_file

    # NOTE(review): document.entities is only assigned above when
    # annotation_dir is truthy; with isTraining set and no annotation_dir
    # this relies on whatever default Document provides - confirm.
    gold_entities = document.entities if isTraining else None

    # The tool is selected from the global options, not from the
    # ``nlp_tool`` argument (which is the tokenizer object itself).
    if opt.nlp_tool == "spacy":
        sentences = get_sentences_and_tokens_from_spacy(
            corpus_file, nlp_tool, gold_entities)
    elif opt.nlp_tool == "nltk":
        sentences = get_sentences_and_tokens_from_nltk(
            corpus_file, nlp_tool, gold_entities, None, None)
    elif opt.nlp_tool == "stanford":
        sentences = get_sentences_and_tokens_from_stanford(
            corpus_file, nlp_tool, gold_entities)
    else:
        raise RuntimeError("invalid nlp tool")

    document.sentences = sentences

    return document, ct_snomed, ct_meddra, ct_unnormed
Exemplo n.º 5
0
def _metamap_entity(position, trigger, umls_id, source_name):
    """Build one Entity from a single ``start/length`` position and trigger.

    Returns ``None`` (after a debug log) when the position is in the
    non-continuous comma form or when the trigger cannot be parsed.
    """
    if position.find(u',') != -1:
        logging.debug(
            "ignore non-continuous form of Positional_Information: {} in {}"
            .format(trigger, source_name))
        return None

    start_and_length = position.split(u"/")
    start = int(start_and_length[0])
    entity = Entity()
    entity.spans.append([start, start + int(start_and_length[1])])
    entity.norm_ids.append(str(umls_id))

    # "B cell lymphoma"-tx-5-"B cell lymphoma"-noun-0
    parts = trigger.split(u"-")

    # Guard the index: a trigger with fewer than four parts falls through
    # to the length check below instead of raising IndexError.
    if len(parts) > 3 and parts[3].find('"') == -1:
        logging.debug("ignore non-string entity: {} in {}".format(
            parts[3], source_name))
        return None

    if len(parts) != 6:
        logging.debug("parsing trigger error, ignore entity: {} in {}".format(
            trigger, source_name))
        return None

    entity.name = parts[3][1:-1]  # remove ""
    return entity


def load_metamap_result_from_file(file_path):
    """Parse a MetaMap output file into a Document holding its entities.

    Each line is split on '|'. Only lines whose second field is ``MMI`` are
    used; lines with fewer than two fields (e.g. blank lines) are skipped.
    Fields read from an MMI line:

        4: UMLS_ID - the CUI of the identified UMLS concept
        6: Trigger_Information - comma separated sextuples showing what
           triggered MMI to identify this UMLS concept
        8: Positional_Information - semicolon-separated list of
           StartPos/Length items, one per trigger, optionally bracketed

    Every trigger/position pair yields one Entity with a
    ``[start, start + length]`` span, the CUI as its norm id, and the
    quoted text from the trigger as its name. Positions in the
    non-continuous comma form and triggers that cannot be parsed are
    skipped with a debug log.

    Args:
        file_path: Path of the UTF-8 encoded MetaMap output file.

    Returns:
        A Document whose ``entities`` attribute holds the parsed entities.

    Raises:
        RuntimeError: If an MMI line has a different number of triggers
            and positions.
        IndexError: If an MMI line has too few fields to contain the
            positional information.
    """
    re_brackets = re.compile(r'\[[0-9|/]+\]')
    # Name used in log and error messages: everything after the last '/'.
    source_name = file_path[file_path.rfind('/') + 1:]

    document = Document()
    entities = []
    with codecs.open(file_path, 'r', 'UTF-8') as fp:
        for line in fp:
            fields = line.strip().split(u"|")

            # Blank lines and lines without '|' have no second field.
            if len(fields) < 2 or fields[1] != u'MMI':
                continue

            umls_id = fields[4]
            trigger_information = fields[6]
            positional_information = fields[8]

            # Drop the first and last character, then split the trigger
            # list on the ',"' that starts each following trigger.
            triggers = trigger_information[1:-1].split(u",\"")
            spans = positional_information.split(u";")
            if len(triggers) != len(spans):
                raise RuntimeError(
                    "the number of triggers is not equal to that of spans: {} in {}"
                    .format(umls_id, source_name))

            for trigger, span in zip(triggers, spans):
                bracket_spans = re_brackets.findall(span)
                if bracket_spans:
                    # Bracketed form: one position per [start/length] group.
                    positions = [item[1:-1] for item in bracket_spans]
                else:
                    # Simple form: the span itself is start/length.
                    positions = [span]

                for position in positions:
                    entity = _metamap_entity(position, trigger, umls_id,
                                             source_name)
                    if entity is not None:
                        entities.append(entity)

    document.entities = entities
    return document