예제 #1
0
def cui_extractor_worker(docSentList, output_file_path):
    """
    extracts all concepts of a given document
    INPUTS: 
    docSentList - list of sentence ids representing a document
    sentDict - dictionary that holds individual sentences, retrieved based on id.
    output_file_path - output file path.
    """
    output_file = open(output_file_path, "a+")
    umls = SQLiteDict(UMLS_DB_LOCATION)
    try:
        for sentNo in docSentList:
            global doc_sent_data
            query = doc_sent_data[str(sentNo).zfill(10)]
            concepts = metamap.retrieve_concepts(query)
            for concept_output in concepts:
                split_concept = concept_output.split("|")
                confidence_score = split_concept[2]
                cui = split_concept[4]
                concept_name = split_concept[3]
                semantic_type = split_concept[5]
                phrase_span = split_concept[8]
                try:
                    concept_definition = umls[cui]
                except KeyError:
                    continue
                output_line = (
                    str(sentNo)
                    + "|"
                    + str(confidence_score)
                    + "|"
                    + cui
                    + "|"
                    + concept_name
                    + "|"
                    + str(concept_definition)
                    + "|"
                    + semantic_type
                    + "|"
                    + phrase_span
                    + "\n"
                )
                output_file.write(output_line)
    finally:
        output_file.close()
예제 #2
0
def generate_concepts(input_dir_path):
    sent_file_path = os.path.join(input_dir_path, "full_text_with_abstract_and_title.metamap")
    pmap_file_path = os.path.join(input_dir_path, "full_text_with_abstract_and_title.metamap.chunkmap")
    output_file_dir = os.path.join(input_dir_path, "concept_files")

    print "STEP 1: reading in document-sentence map file"
    docSentMapBinFile = open(pmap_file_path, "rb")
    map_bin_data = docSentMapBinFile.read()
    sio = StringIO.StringIO(map_bin_data)
    docSentMap = pickle.load(sio)

    doc_sent_data = {}
    sentMapData = open(sent_file_path, "rU")
    for sentData in sentMapData.readlines():
        sentence = sentData.split("|")
        doc_sent_data[sentence[0]] = sentence[1]
    print "STEP 1 COMPLETE"

    print "STEP 2: Generating concept documents "
    try:
        count = 0
        for document in docSentMap.keys():
            output_file_path = os.path.join(output_file_dir, document)
            output_file = open(output_file_path, "a+")
            try:
                umls = SQLiteDict(UMLS_DB_LOCATION)
                try:
                    for sentNo in docSentMap[document]:
                        concepts = metamap.retrieve_concepts(doc_sent_data[str(sentNo).zfill(10)])
                        for concept_output in concepts:
                            split_concept = concept_output.split("|")
                            confidence_score = split_concept[2]
                            cui = split_concept[4]
                            concept_name = split_concept[3]
                            semantic_type = split_concept[5]
                            phrase_span = split_concept[8]
                            concept_dne_file = open(CUI_DNE_FILE, "a+")
                            try:
                                concept_definition = umls[cui]
                            except KeyError:
                                concept_dne_file.write(
                                    document + "|" + str(sentNo) + "|" + phrase_span + "|" + cui + "\n"
                                )
                                continue
                            finally:
                                concept_dne_file.close()
                            output_line = (
                                str(sentNo)
                                + "|"
                                + str(confidence_score)
                                + "|"
                                + cui
                                + "|"
                                + concept_name
                                + "|"
                                + str(concept_definition)
                                + "|"
                                + semantic_type
                                + "|"
                                + phrase_span
                                + "\n"
                            )
                            output_file.write(output_line)
                finally:
                    output_file.close()
                    concept_dne_file.close()
                    count = count + 1
            except IOError:
                print "UMLS DB does not exist. PLEASE RUN umls_db.py before running this program"
            print "PROCESSED %d documents" % count
    except KeyError:
        print "Key not found"