Пример #1
0
def processOneFile_fda(fileName, annotation_dir, nlp_tool, isTraining, types,
                       type_filter, isFDA2018, isNorm):
    documents = []
    annotation_file = get_fda_file(join(annotation_dir, fileName))

    # each section is a document
    for section in annotation_file.sections:
        document = Document()
        document.name = fileName[:fileName.find('.')] + "_" + section.id
        if section.text is None:
            document.text = ""
            document.entities = []
            document.sentences = []
            documents.append(document)
            continue

        document.text = section.text

        entities = []

        if isFDA2018 == False and isNorm == True:
            for reaction in annotation_file.reactions:
                entity = Entity()
                entity.name = reaction.name
                for normalization in reaction.normalizations:
                    entity.norm_ids.append(
                        normalization.meddra_pt_id)  # can be none
                    entity.norm_names.append(normalization.meddra_pt)
                entities.append(entity)

        else:
            for entity in annotation_file.mentions:
                if entity.section != section.id:
                    continue
                if types and (entity.type not in type_filter):
                    continue
                entities.append(entity)

        document.entities = entities

        if opt.nlp_tool == "nltk":
            if isTraining:
                sentences = get_sentences_and_tokens_from_nltk(
                    section.text, nlp_tool, document.entities,
                    annotation_file.ignore_regions, section.id)
            else:
                sentences = get_sentences_and_tokens_from_nltk(
                    section.text, nlp_tool, None,
                    annotation_file.ignore_regions, section.id)
        else:
            raise RuntimeError("invalid nlp tool")

        document.sentences = sentences

        documents.append(document)

    return documents, annotation_file
Пример #2
0
def processOneFile_fda(fileName, annotation_dir, types, type_filter, isFDA2018,
                       isNorm):
    documents = []
    annotation_file = get_fda_file(os.path.join(annotation_dir, fileName))

    # each section is a document
    for section in annotation_file.sections:
        document = Document()
        document.name = fileName[:fileName.find('.')] + "_" + section.id
        if section.text is None:
            document.text = ""
            document.entities = []
            document.sentences = []
            documents.append(document)
            continue

        document.text = section.text

        entities = []

        if isFDA2018 == False and isNorm == True:
            for reaction in annotation_file.reactions:
                entity = Entity()
                entity.name = reaction.name
                for normalization in reaction.normalizations:
                    entity.norm_ids.append(
                        normalization.meddra_pt_id)  # can be none
                    entity.norm_names.append(normalization.meddra_pt)
                entities.append(entity)

        else:
            for entity in annotation_file.mentions:
                if entity.section != section.id:
                    continue
                if types and (entity.type not in type_filter):
                    continue
                entities.append(entity)

        document.entities = entities

        document.sentences = []

        documents.append(document)

    return documents, annotation_file
Пример #3
0
def processOneFile(fileName, annotation_dir, corpus_dir, nlp_tool, isTraining,
                   types, type_filter):
    document = Document()
    document.name = fileName[:fileName.find('.')]

    ct_snomed = 0
    ct_meddra = 0
    ct_unnormed = 0

    if annotation_dir:
        annotation_file = get_bioc_file(join(annotation_dir, fileName))
        bioc_passage = annotation_file[0].passages[0]
        entities = []

        for entity in bioc_passage.annotations:
            if types and (entity.infons['type'] not in type_filter):
                continue
            entity_ = Entity()
            entity_.id = entity.id
            processed_name = entity.text.replace('\\n', ' ')
            if len(processed_name) == 0:
                logging.debug("{}: entity {} name is empty".format(
                    fileName, entity.id))
                continue
            entity_.name = processed_name

            entity_.type = entity.infons['type']
            entity_.spans.append(
                [entity.locations[0].offset, entity.locations[0].end])
            if ('SNOMED code' in entity.infons and entity.infons['SNOMED code'] != 'N/A')\
                    and ('SNOMED term' in entity.infons and entity.infons['SNOMED term'] != 'N/A'):
                entity_.norm_ids.append(entity.infons['SNOMED code'])
                entity_.norm_names.append(entity.infons['SNOMED term'])
                ct_snomed += 1
            elif ('MedDRA code' in entity.infons and entity.infons['MedDRA code'] != 'N/A')\
                    and ('MedDRA term' in entity.infons and entity.infons['MedDRA term'] != 'N/A'):
                entity_.norm_ids.append(entity.infons['MedDRA code'])
                entity_.norm_names.append(entity.infons['MedDRA term'])
                ct_meddra += 1
            else:
                logging.debug("{}: no norm id in entity {}".format(
                    fileName, entity.id))
                ct_unnormed += 1
                continue

            entities.append(entity_)

        document.entities = entities

    corpus_file = get_text_file(join(corpus_dir, fileName.split('.bioc')[0]))
    document.text = corpus_file

    if opt.nlp_tool == "spacy":
        if isTraining:
            sentences = get_sentences_and_tokens_from_spacy(
                corpus_file, nlp_tool, document.entities)
        else:
            sentences = get_sentences_and_tokens_from_spacy(
                corpus_file, nlp_tool, None)
    elif opt.nlp_tool == "nltk":
        if isTraining:
            sentences = get_sentences_and_tokens_from_nltk(
                corpus_file, nlp_tool, document.entities, None, None)
        else:
            sentences = get_sentences_and_tokens_from_nltk(
                corpus_file, nlp_tool, None, None, None)
    elif opt.nlp_tool == "stanford":
        if isTraining:
            sentences = get_sentences_and_tokens_from_stanford(
                corpus_file, nlp_tool, document.entities)
        else:
            sentences = get_sentences_and_tokens_from_stanford(
                corpus_file, nlp_tool, None)
    else:
        raise RuntimeError("invalid nlp tool")

    document.sentences = sentences

    return document, ct_snomed, ct_meddra, ct_unnormed