def nct_tagging(index_name, process_ids, stopwords, umls, pos, nprocs=1):
    """Tag clinical-trial eligibility criteria and build a sentence index.

    Loads the NCT ids listed in ``process_ids``, prepares a companion
    '<index_name>_sent' ElasticSearch index for sentence-level tags, and
    fans the ids out to ``nprocs`` worker processes.

    Args:
        index_name: name of the existing trial index in ElasticSearch.
        process_ids: path to a file with one NCT id per line.
        stopwords: stopword resource forwarded to the workers.
        umls: UMLS resource forwarded to the workers.
        pos: POS resource forwarded to the workers.
        nprocs: number of worker processes to spawn (default 1).
    """
    # load the clinical trial ids to process; 'with' closes the file
    # deterministically (the original leaked the file handle)
    with open(process_ids, 'rb') as fin:
        nct_ids = [line.strip() for line in fin]

    # Check if index exists
    index = es_index.ElasticSearch_Index(index_name)

    # Create a new sentence index
    sent_index_name = index_name + '_sent'
    sent_index = es_index.ElasticSearch_Index(sent_index_name)
    sent_index.open_index()
    sent_index.add_field('ec_raw_text')
    sent_index.add_field('ec_tags_umls', term_vector=True)

    # process each clinical trial and store to XML file
    log.info('processing clinical trials')
    procs = []
    # split the id list into one contiguous chunk per worker
    chunksize = int(math.ceil(len(nct_ids) / float(nprocs)))
    for i in xrange(nprocs):
        p = Process(target=_worker,
                    args=(nct_ids[chunksize * i:chunksize * (i + 1)],
                          index_name, sent_index_name, stopwords, umls, pos,
                          (i + 1)))
        procs.append(p)
        p.start()

    # wait for every worker to finish before returning
    for p in procs:
        p.join()
# --- Example #2 ---
# 0
def _worker(nct, index_name, host, port_no, stopwords, umls, pos, npr):
    index = es_index.ElasticSearch_Index(index_name, host=host, port=port_no)
    tagger = Tagger(5, stopwords, umls, pos)
    # Iterate over NCT trials
    for i in xrange(1, len(nct) + 1):
        nctid = nct[i - 1]
        # if i == 3:
        #break
        # if nctid != 'NCT00000331':
        # continue
        if i % 500 == 0:
            log.info(' --- core %d: processed %d documents' % (npr, i))

        # Get document from the Elastic Search Index
        doc = index.get_trail(nctid)

        if not doc.has_key('ec_raw_text'):
            continue

        ec = doc['ec_raw_text']

        if ec is None:
            continue

        (pec, jpec) = tagger.process(ec)

        dictlist = []
        for key, value in jpec.iteritems():
            for i in range(value):
                dictlist.append(key)
        doc['ec_tags_umls'] = dictlist
        print nctid, dictlist
        # Index the new document
        index.index_trial(nctid, doc)
def _worker(nct, index_name, sent_index_name, stopwords, umls, pos, npr):
    index = es_index.ElasticSearch_Index(index_name)
    sent_index = es_index.ElasticSearch_Index(sent_index_name)
    for i in xrange(1, len(nct) + 1):
        nctid = nct[i - 1]
        print nctid
        if i % 500 == 0:
            log.info(' --- core %d: indexed %d documents' % (npr, i))

        doc = index.get_trail(nctid)
        ec = doc['ec_raw_text']

        if ec == None:
            continue

        sent_id = 1
        for it in ec:
            sent = ec[it].split(' - ')

            for s in sent:
                tags = {}
                proc = TextProcesser(s, 5, stopwords, umls, pos)
                proc.process()
                for pp in proc.ptxt:
                    freq = tags.setdefault(pp, 0)
                    tags[pp] = freq + 1

                if len(tags) == 0:
                    continue

                dictlist = []
                for key, value in substring_filtering(tags, 1).iteritems():
                    temp = [key, value]
                    dictlist.append(temp)

                # Index the tags
                sent_data = {}
                sent_data['id'] = nctid + '_' + str(sent_id)
                sent_data['ec_raw_text'] = s.strip()
                sent_data['ec_tags_umls'] = dictlist

                sent_index.index_trial(nctid + '_' + str(sent_id), sent_data)

                sent_id += 1
# --- Example #4 ---
# 0
def _worker(nct, data_path, index_name, host, port_no, npr):
    """Worker: parse each assigned clinical trial from disk and index it."""
    index = es_index.ElasticSearch_Index(index_name, host=host, port=port_no)
    parser = ctgov_parser.ClinicalTrial_Parser(data_path)

    # doc_no is 1-based so the progress message counts finished documents
    for doc_no, trial_id in enumerate(nct, 1):
        if doc_no % 500 == 0:
            log.info(' --- core %d: indexed %d documents' % (npr, doc_no))
        index.index_trial(trial_id, parser.parse(trial_id))
# --- Example #5 ---
# 0
def update_ES_index(nct_disease, index_name, host, port_no):
    """Attach disease annotations to trial documents in the ES index.

    Args:
        nct_disease: mapping from NCT id to its disease annotation.
        index_name: ElasticSearch index holding the trial documents.
        host: ElasticSearch host.
        port_no: ElasticSearch port.
    """
    index = es_index.ElasticSearch_Index(index_name, host=host, port=port_no)

    # Iterate over NCT trials
    for nctid in nct_disease:
        # Get document from the Elastic Search Index
        doc = index.get_trail(nctid)

        # skip missing documents or those without raw criteria text
        # ('in' replaces the deprecated dict.has_key)
        if doc is False or 'ec_raw_text' not in doc:
            continue

        ec = doc['ec_raw_text']

        if ec is None:
            continue

        doc['disease'] = nct_disease[nctid]
        index.index_trial(nctid, doc)
# --- Example #6 ---
# 0
def nct_index(din,
              index_name,
              host='localhost',
              port_no=9200,
              nprocs=1,
              settings_file=None):
    """Index clinical-trial XML documents into ElasticSearch.

    Reads '<din>/trial_ids.txt' for the list of NCT ids, verifies that
    '<din>/trials_xml/' exists, creates/opens the target index and fans
    the ids out to ``nprocs`` parser/indexer worker processes.

    Args:
        din: input directory with 'trial_ids.txt' and 'trials_xml/'.
        index_name: name of the ElasticSearch index to populate.
        host: ElasticSearch host (default 'localhost').
        port_no: ElasticSearch port (default 9200).
        nprocs: number of worker processes to spawn (default 1).
        settings_file: optional index-settings file passed to open_index.
    """
    # open the clinical trial ids file and load to a list; 'with' closes
    # the file deterministically (the original leaked the handle)
    log.info('opening file -- trial_ids.txt')

    with open(din + '/trial_ids.txt', 'rb') as fin:
        nct_ids = [line.strip() for line in fin]

    # Check directories
    trials_din = din + '/trials_xml/'
    if not os.path.exists(trials_din):
        log.error('trials_xml directory does not exists in %s \n' % din)
        # a missing input directory is a failure: exit non-zero (the
        # original exited with 0, signalling success to the caller)
        exit(1)

    # Create index using the provided settings file
    index = es_index.ElasticSearch_Index(index_name, host=host, port=port_no)
    if settings_file is None:
        index.open_index()
    else:
        index.open_index(settings_file)

    index.add_field('conditions', term_vector=False)
    index.add_field('study_type', term_vector=False)

    # process each clinical trial and store to XML file
    log.info('processing clinical trials')
    procs = []
    # split the id list into one contiguous chunk per worker
    chunksize = int(math.ceil(len(nct_ids) / float(nprocs)))
    for i in xrange(nprocs):
        p = Process(target=_worker,
                    args=(nct_ids[chunksize * i:chunksize * (i + 1)],
                          trials_din, index_name, host, port_no, (i + 1)))
        procs.append(p)
        p.start()

    # wait for every worker to finish before returning
    for p in procs:
        p.join()
# --- Example #7 ---
# 0
def build_graph(ec_index_name, host, port, neo_url, umls):
    """Build a Neo4j graph of diseases, CDEs and their co-occurrence links.

    Pulls unique 'conditions' and 'ec_tags_umls' terms from the trial
    index, creates one Disease node per condition and one CDE node per
    tag, then adds a 'related_to' relationship for every disease/CDE pair
    co-occurring in more than two trials.

    Args:
        ec_index_name: ElasticSearch index with the tagged trials.
        host: ElasticSearch host.
        port: ElasticSearch port.
        neo_url: Neo4j service URL.
        umls: UMLS resource used to look up CDE semantic types.
    """
    # Initialize Graph
    graph_db = neo4j.GraphDatabaseService(neo_url)

    # Create CDE index
    cde_index = graph_db.get_or_create_index(neo4j.Node, 'cde-index')
    disease_index = graph_db.get_or_create_index(neo4j.Node, 'disease-index')

    # Initialize ElasticSearch Index
    index = es_index.ElasticSearch_Index(ec_index_name, host, port)

    # Get all unique conditions
    conditions = index.get_unique_terms('conditions', min_docs=2)
    log.info(' Found %d conditions' % len(conditions))

    # Add Diseases (conditions) with properties to GraphDB
    batch = neo4j.WriteBatch(graph_db)
    for disease, freq in conditions.iteritems():
        tmp = batch.create(node(name=disease, trail_with_disease=freq))
        batch.set_labels(tmp, 'Disease')
        batch.add_to_index(neo4j.Node, disease_index, 'name', disease, tmp)
    batch.run()
    log.info(' Successfully added %d conditions' % len(conditions))

    # Get all unique CDEs (min_docs=5 prunes very rare tags)
    cdes = index.get_unique_terms('ec_tags_umls', min_docs=5)
    log.info(' Found %d CDEs' % len(cdes))

    # Add CDEs with properties to GraphDB
    batch = neo4j.WriteBatch(graph_db)
    for cde, freq in cdes.iteritems():
        stype = get_semantic_types(cde, umls)
        tmp = batch.create(node(name=cde, num_trail=freq,
                                semantic_types=stype))
        batch.set_labels(tmp, 'CDE')
        batch.add_to_index(neo4j.Node, cde_index, 'name', cde, tmp)
    batch.run()
    log.info('Successfully added %d CDEs' % len(cdes))

    # For each disease-CDE pair get co-occurrence counts; one write
    # batch per CDE keeps batch sizes bounded
    rel_count = 0
    i = 0
    for cde in cdes:
        i += 1
        batch = neo4j.WriteBatch(graph_db)
        cde_node = graph_db.get_indexed_node('cde-index', 'name', cde)
        for disease in conditions:
            freq = get_disease_cde_count(index, disease, cde)
            # only keep pairs seen together in more than two trials
            if freq <= 2:
                continue
            rel_count += 1
            disease_node = graph_db.get_indexed_node('disease-index', 'name',
                                                     disease)
            batch.create(rel(cde_node, 'related_to', disease_node, freq=freq))
        if i % 50 == 0:
            log.info('Rel count at %d -- %d out of %d CDEs seen' %
                     (rel_count, i, len(cdes)))
        batch.run()
    log.info('Successfully added %d relationships' % rel_count)