def nct_tagging(index_name, process_ids, stopwords, umls, pos, nprocs=1):
    """Tag eligibility-criteria text for the listed clinical trials and
    index the tagged sentences into a companion '<index_name>_sent' index.

    :param index_name: name of the existing ElasticSearch trial index
    :param process_ids: path to a file with one NCT id per line
    :param stopwords: stopword resource forwarded to the workers
    :param umls: UMLS resource forwarded to the workers
    :param pos: POS resource forwarded to the workers
    :param nprocs: number of worker processes to spawn
    """
    # Load the clinical trial ids to process; the context manager closes
    # the file even if a read fails (the original leaked the handle).
    nct_ids = []
    with open(process_ids, 'rb') as fin:
        for line in fin:
            nct_ids.append(line.strip())

    # Check if the trial index exists
    index = es_index.ElasticSearch_Index(index_name)

    # Create a new sentence index alongside the trial index
    sent_index_name = index_name + '_sent'
    sent_index = es_index.ElasticSearch_Index(sent_index_name)
    sent_index.open_index()
    sent_index.add_field('ec_raw_text')
    sent_index.add_field('ec_tags_umls', term_vector=True)

    # Process each clinical trial, splitting the id list evenly across
    # nprocs worker processes.
    log.info('processing clinical trials')
    procs = []
    chunksize = int(math.ceil(len(nct_ids) / float(nprocs)))
    for i in xrange(nprocs):
        p = Process(target=_worker,
                    args=(nct_ids[chunksize * i:chunksize * (i + 1)],
                          index_name, sent_index_name,
                          stopwords, umls, pos, (i + 1)))
        procs.append(p)
        p.start()

    for p in procs:
        p.join()
def _worker(nct, index_name, host, port_no, stopwords, umls, pos, npr): index = es_index.ElasticSearch_Index(index_name, host=host, port=port_no) tagger = Tagger(5, stopwords, umls, pos) # Iterate over NCT trials for i in xrange(1, len(nct) + 1): nctid = nct[i - 1] # if i == 3: #break # if nctid != 'NCT00000331': # continue if i % 500 == 0: log.info(' --- core %d: processed %d documents' % (npr, i)) # Get document from the Elastic Search Index doc = index.get_trail(nctid) if not doc.has_key('ec_raw_text'): continue ec = doc['ec_raw_text'] if ec is None: continue (pec, jpec) = tagger.process(ec) dictlist = [] for key, value in jpec.iteritems(): for i in range(value): dictlist.append(key) doc['ec_tags_umls'] = dictlist print nctid, dictlist # Index the new document index.index_trial(nctid, doc)
def _worker(nct, index_name, sent_index_name, stopwords, umls, pos, npr): index = es_index.ElasticSearch_Index(index_name) sent_index = es_index.ElasticSearch_Index(sent_index_name) for i in xrange(1, len(nct) + 1): nctid = nct[i - 1] print nctid if i % 500 == 0: log.info(' --- core %d: indexed %d documents' % (npr, i)) doc = index.get_trail(nctid) ec = doc['ec_raw_text'] if ec == None: continue sent_id = 1 for it in ec: sent = ec[it].split(' - ') for s in sent: tags = {} proc = TextProcesser(s, 5, stopwords, umls, pos) proc.process() for pp in proc.ptxt: freq = tags.setdefault(pp, 0) tags[pp] = freq + 1 if len(tags) == 0: continue dictlist = [] for key, value in substring_filtering(tags, 1).iteritems(): temp = [key, value] dictlist.append(temp) # Index the tags sent_data = {} sent_data['id'] = nctid + '_' + str(sent_id) sent_data['ec_raw_text'] = s.strip() sent_data['ec_tags_umls'] = dictlist sent_index.index_trial(nctid + '_' + str(sent_id), sent_data) sent_id += 1
def _worker(nct, data_path, index_name, host, port_no, npr):
    """Worker: parse each assigned trial's XML file and index the result.

    :param nct: list of NCT ids assigned to this worker
    :param data_path: directory containing the trial XML files
    :param index_name: target ElasticSearch index
    :param host: ElasticSearch host
    :param port_no: ElasticSearch port
    :param npr: 1-based worker number, used only for progress logging
    """
    index = es_index.ElasticSearch_Index(index_name, host=host, port=port_no)
    parser = ctgov_parser.ClinicalTrial_Parser(data_path)

    for count, nctid in enumerate(nct, 1):
        if count % 500 == 0:
            log.info(' --- core %d: indexed %d documents' % (npr, count))
        trail_doc = parser.parse(nctid)
        index.index_trial(nctid, trail_doc)
def update_ES_index(nct_disease, index_name, host, port_no):
    """Attach disease annotations to the matching trial documents.

    :param nct_disease: dict mapping NCT id -> disease annotation
    :param index_name: ElasticSearch index holding the trial documents
    :param host: ElasticSearch host
    :param port_no: ElasticSearch port
    """
    index = es_index.ElasticSearch_Index(index_name, host=host, port=port_no)

    # Iterate over NCT trials; iteritems avoids the original second
    # dict lookup per id.
    for nctid, diseases in nct_disease.iteritems():
        # Get the document from the ElasticSearch index; skip trials that
        # are missing or have no eligibility-criteria text ('in' replaces
        # deprecated has_key).
        doc = index.get_trail(nctid)
        if doc is False or 'ec_raw_text' not in doc:
            continue
        ec = doc['ec_raw_text']
        if ec is None:
            continue
        doc['disease'] = diseases
        index.index_trial(nctid, doc)
def nct_index(din, index_name, host='localhost', port_no=9200, nprocs=1,
              settings_file=None):
    """Parse and index all clinical trials listed in <din>/trial_ids.txt.

    :param din: data directory containing trial_ids.txt and trials_xml/
    :param index_name: name of the ElasticSearch index to create/populate
    :param host: ElasticSearch host
    :param port_no: ElasticSearch port
    :param nprocs: number of worker processes to spawn
    :param settings_file: optional index settings file passed to open_index
    """
    # Load the clinical trial ids; the context manager closes the file
    # deterministically (the original leaked the handle).
    log.info('opening file -- trial_ids.txt')
    nct_ids = []
    with open(din + '/trial_ids.txt', 'rb') as fin:
        for line in fin:
            nct_ids.append(line.strip())

    # Check that the trials directory exists before spawning workers
    trials_din = din + '/trials_xml/'
    if not os.path.exists(trials_din):
        log.error('trials_xml directory does not exists in %s \n' % din)
        # exit with a failure code -- the original exited 0 on this error
        exit(1)

    # Create the index using the provided settings file (if any)
    index = es_index.ElasticSearch_Index(index_name, host=host, port=port_no)
    if settings_file is None:
        index.open_index()
    else:
        index.open_index(settings_file)
    index.add_field('conditions', term_vector=False)
    index.add_field('study_type', term_vector=False)

    # Process each clinical trial, splitting the id list evenly across
    # nprocs worker processes.
    log.info('processing clinical trials')
    procs = []
    chunksize = int(math.ceil(len(nct_ids) / float(nprocs)))
    for i in xrange(nprocs):
        p = Process(target=_worker,
                    args=(nct_ids[chunksize * i:chunksize * (i + 1)],
                          trials_din, index_name, host, port_no, (i + 1)))
        procs.append(p)
        p.start()

    for p in procs:
        p.join()
def build_graph(ec_index_name, host, port, neo_url, umls):
    """Build the disease/CDE co-occurrence graph in Neo4j from the
    ElasticSearch eligibility-criteria index.

    :param ec_index_name: ElasticSearch index with tagged trials
    :param host: ElasticSearch host
    :param port: ElasticSearch port
    :param neo_url: URL of the Neo4j service
    :param umls: UMLS resource used to look up CDE semantic types
    """
    # Initialize the graph and its two legacy node indexes
    graph_db = neo4j.GraphDatabaseService(neo_url)
    cde_index = graph_db.get_or_create_index(neo4j.Node, 'cde-index')
    disease_index = graph_db.get_or_create_index(neo4j.Node, 'disease-index')

    # Initialize the ElasticSearch index
    index = es_index.ElasticSearch_Index(ec_index_name, host, port)

    # Get all unique conditions appearing in at least 2 trials
    conditions = index.get_unique_terms('conditions', min_docs=2)
    log.info(' Found %d conditions' % len(conditions))

    # Add disease (condition) nodes with properties in one write batch
    batch = neo4j.WriteBatch(graph_db)
    for disease, freq in conditions.iteritems():
        tmp = batch.create(node(name=disease, trail_with_disease=freq))
        batch.set_labels(tmp, 'Disease')
        batch.add_to_index(neo4j.Node, disease_index, 'name', disease, tmp)
    batch.run()
    log.info(' Successfully added %d conditions' % len(conditions))

    # Get all unique CDEs appearing in at least 5 trials
    cdes = index.get_unique_terms('ec_tags_umls', min_docs=5)
    log.info(' Found %d CDEs' % len(cdes))

    # Add CDE nodes with properties in one write batch
    batch = neo4j.WriteBatch(graph_db)
    for cde, freq in cdes.iteritems():
        stype = get_semantic_types(cde, umls)
        tmp = batch.create(node(name=cde, num_trail=freq,
                                semantic_types=stype))
        batch.set_labels(tmp, 'CDE')
        batch.add_to_index(neo4j.Node, cde_index, 'name', cde, tmp)
    batch.run()
    log.info('Successfully added %d CDEs' % len(cdes))

    # For each disease-CDE pair whose co-occurrence count exceeds 2,
    # create a 'related_to' relationship (one write batch per CDE).
    # enumerate replaces the original hand-rolled counter; iterating the
    # dicts directly replaces the redundant .keys() calls.
    rel_count = 0
    for i, cde in enumerate(cdes, 1):
        batch = neo4j.WriteBatch(graph_db)
        cde_node = graph_db.get_indexed_node('cde-index', 'name', cde)
        for disease in conditions:
            freq = get_disease_cde_count(index, disease, cde)
            if freq <= 2:
                continue
            rel_count += 1
            disease_node = graph_db.get_indexed_node('disease-index',
                                                     'name', disease)
            batch.create(rel(cde_node, 'related_to', disease_node,
                             freq=freq))
        if i % 50 == 0:
            log.info('Rel count at %d -- %d out of %d CDEs seen' %
                     (rel_count, i, len(cdes)))
        batch.run()
    log.info('Successfully added %d relationships' % rel_count)