Example #1
0
def main():
    """Pipeline driver: read config, fetch PubMed data for a list of PMIDs,
    optionally run the MedKATp NER pipeline, then run analysis/clustering
    on either the NER output or the raw text.

    Reads all parameters from the config file via ConfigParserHelper; has no
    arguments and no return value — all results are side effects (files,
    logs, console output).
    """
    logger.debug("Reading config parameters from file - Start")
    cfg = ConfigParserHelper()
    # Fixed: original used a Python 2 `print` statement here, inconsistent
    # with the print() function used everywhere else in this function.
    print(cfg.get('main', 'title'))

    # Get the config parameters
    filename = cfg.get('main', 'input_filename')                        # input file with pmids
    source_type = cfg.get('main', 'source_type')                        # source type - currently supports only 'Pubmed'
    java_path = cfg.get('ner', 'java_path')                             # java path to run the MedKATp jar
    medkat_conf = cfg.get('ner', 'medkat_conf_file')                    # MedKATp config file
    txt_input = cfg.get('ner', 'txt_inp_filepath')                      # MedKATp input folder with txt files
    xmi_output = cfg.get('ner', 'xmi_out_filepath')                     # MedKATp output folder with xmi files
    enable = cfg.get('ner', 'enable')                                   # to enable/disable running NER
    num_clusters = cfg.get('cluster', 'num_clusters')                   # number of clusters
    num_words_per_cluster = cfg.get('cluster', 'num_words_per_cluster') # top words displayed per cluster

    logger.info("Reading config parameters - Done")
    print("Reading config parameters - Done")

    logger.debug("Retrieving pmids from file - Start")
    # Read the PMIDs, one per line, from the input file.
    with open(filename) as f:
        lines = f.read().splitlines()
    logger.info("Retrieving pmids from file - Done")
    print("Retrieving pmids from file - Done")

    logger.debug("Retrieving and storing Pubmed data - Start")
    # Only 'Pubmed' is supported as a source type today; other values are
    # silently skipped (preserving the original best-effort behavior).
    if source_type == 'Pubmed':
        pubdata = PubmedData()
        pubdata.getPubmedData(lines)
    logger.info("Retrieving and storing Pubmed data - Done")
    print("Retrieving and storing Pubmed data - Done")

    # Helpers: input preparation and output parsing/clustering.
    ihelper = InputHelper()
    xmi_parser = XMIParser()

    # If NER is enabled, prepare input, run the MedKATp pipeline, and analyze
    # the identified terms; otherwise analyze the complete text directly.
    if enable.lower() == 'true':
        logger.debug("NER enable set to true")

        logger.debug("Creating input and output directories for NER and preparing input - Start")
        ihelper.create_input_files(filename, txt_input, xmi_output)
        logger.info("Creating input and output directories for NER and preparing input - Done")
        print("Creating input and output directories for NER and preparing input - Done")

        logger.debug("Running text file processing (NER) on input text - Start")
        # Set the environment variables MedKATp expects, then run it as a
        # blocking Java subprocess.
        env = dict(os.environ)
        env["MEDKATp_INP_CONFIG_FILE"] = medkat_conf
        env["CLASSPATH"] = ".:lib/*:MedKATp-20160109.jar:/opt/nlp/MedKATp/resources/"

        import subprocess
        subprocess.call(
            [java_path, '-Xmx1024m', '-Xms256m', 'nlp.pubmed.ner.inputReader.MedKATRunner'],
            env=env)
        logger.info("Running text file processing (NER) on input text - Done")
        print("Running text file processing (NER) on input text - Done")

        logger.debug("Running analysis and clustering on NER output - Start")
        xmi_parser.parse_xmi_files(xmi_output, num_clusters, num_words_per_cluster)
        # Fixed typo in the log messages above/below: "analysis an" -> "analysis and".
        logger.info("Running analysis and clustering on NER output - Done")

    else:
        logger.debug("NER enable set to false")

        logger.debug("Preparing input for analysis and clustering- Start")
        input_list = ihelper.create_input(filename)
        # Split the pmid->text mapping into parallel lists (replaces the
        # original manual append loop; preserves the mapping's key order).
        pmids_array = list(input_list.keys())
        text_array = [input_list[key] for key in pmids_array]
        logger.info("Preparing input for analysis and clustering- Done")
        print("Preparing input for analysis and clustering- Done")

        logger.debug("Running analysis and clustering on text - Start")
        xmi_parser.analyze_and_cluster(pmids_array, text_array, num_clusters, num_words_per_cluster)
        logger.info("Running analysis and clustering on text - Done")
Example #2
0
File: db.py — Project: htaox/nlp
# Not used currently; kept for extensibility.
class ArticleSentence(Base):
    """SQLAlchemy declarative model for a sentence span within an article."""
    __tablename__ = 'articlesentence'
    # Surrogate primary key for the sentence row.
    id = Column(Integer, primary_key=True)
    #sentenceID = Column(Integer)
    # Foreign key to the owning article's PMID (PubMed identifier).
    articleID = Column(Integer, ForeignKey('article.PMID'))
    # Start/end positions of the sentence within the article text —
    # presumably character offsets; TODO confirm units against the writer.
    startPos = Column(Integer)
    endPos = Column(Integer)
    #sentIndex = Column(Integer)

class DBHelper:
    """Owns a single SQLAlchemy session bound to the configured database."""

    # Filled in per-instance by __init__.
    session = None

    def __init__(self):
        """Build an engine from the configured connection string and open a session."""
        db_engine = create_engine(cfg.get('main', 'db_conn_string'))
        # Bind a Session factory ("Session" class) to the engine, then
        # instantiate one working session from it.
        session_factory = sessionmaker(bind=db_engine)
        self.session = session_factory()


from sqlalchemy.orm import sessionmaker

# Build the engine from the configured database connection string.
cfg = ConfigParserHelper()
engine = create_engine(cfg.get('main', 'db_conn_string'))

# NOTE: despite the name, `session` is a Session *factory* (sessionmaker),
# not an open session; it is configured to bind to the engine above.
session = sessionmaker()
session.configure(bind=engine)

# Emit CREATE TABLE for any mapped tables that do not yet exist.
Base.metadata.create_all(engine)
Example #3
0
class ArticleSentence(Base):
    """SQLAlchemy declarative model mapping a sentence span to its article."""
    __tablename__ = 'articlesentence'
    # Surrogate primary key.
    id = Column(Integer, primary_key=True)
    #sentenceID = Column(Integer)
    # References the parent article via its PMID (PubMed identifier).
    articleID = Column(Integer, ForeignKey('article.PMID'))
    # Span boundaries of the sentence — assumed to be character offsets into
    # the article text; verify against the code that populates this table.
    startPos = Column(Integer)
    endPos = Column(Integer)
    #sentIndex = Column(Integer)


class DBHelper:
    """Thin wrapper that opens one SQLAlchemy session on construction."""

    session = None  # set on each instance in __init__

    def __init__(self):
        """Create an engine from the configured connection string, bind a
        Session factory to it, and immediately open a session."""
        conn_string = cfg.get('main', 'db_conn_string')
        # sessionmaker(...) returns a configured "Session" class; calling it
        # yields the actual session object this helper exposes.
        self.session = sessionmaker(bind=create_engine(conn_string))()


from sqlalchemy.orm import sessionmaker

# Read the connection string from config and construct the shared engine.
cfg = ConfigParserHelper()
engine = create_engine(cfg.get('main', 'db_conn_string'))

# `session` here is actually a sessionmaker factory (a "Session" class),
# bound to the engine via configure(); call it to obtain a real session.
session = sessionmaker()
session.configure(bind=engine)

# Ensure all declaratively-mapped tables exist in the database.
Base.metadata.create_all(engine)