""" Time Analysis of CDEs @author: Riccardo Miotto """ import argparse from matplotlib.backends.backend_pdf import PdfPages from pylab import * import numpy as np import matplotlib.pyplot as plt from ctgov.utility.log import strd_logger import ctgov.utility.file as ufile log = strd_logger('cde-time-analysis') def cde_analysis(ddata, dout, ystep=1): if ystep < 1: log.error('the year step needs to be greater than 1 -- interrupting') return dout = '%s/year-step-%d' % (dout, ystep) if not ufile.mkdir(dout): log.error('impossible to create the output directory - interrupting') return # get list of diseases ddata = '%s/year-step-%d' % (ddata, ystep) ldis = sorted(os.walk(ddata).next()[1])
''' Retrieve Disease - NCT associations starting from a list of diseases @author: Riccardo Miotto ''' from ctgov.utility.log import strd_logger from ctgov.utility.web import download_web_data from collections import defaultdict import ctgov.index.es_index as es_index import xml.etree.ElementTree as xml_parser import ctgov.utility.file as ufile import argparse, sys log = strd_logger('disease-nct-association') def mine_disease_to_nct(ldisease, fout=None, ctmin=100): url = 'http://clinicaltrials.gov/search?cond=%s&displayxml=true&count=%s' log.info('found %d disease to process \n' % len(ldisease)) ldisease = sorted(map(lambda x: ' '.join(x.lower().split()), ldisease)) nct_disease = defaultdict(list) c = 1 for d in sorted(ldisease): log.info('processing: "%s"' % d) d = d.replace(',', '') fd = d.replace(' ', '+') # number of trials xmltree = xml_parser.fromstring(download_web_data(url % (fd, '0'))) nres = xmltree.get('count')
@author: Riccardo Miotto <rm3086 (at) columbia (dot) edu> Modified by @author: Praveen Chandar """ from ctgov.utility.log import strd_logger from ctgov.utility.web import clean_text from ctgov.concept_mapping.filters import ConceptFilters from ctgov.concept_mapping.dict_mapping import DictionaryMapping from itertools import groupby import math import nltk import string log = strd_logger('concept-tagger') class Tagger: # constructor def __init__(self, ngram=5, stop=None, umls=None, ptag=None): self.filter = ConceptFilters(ngram, stop, ptag) self.mapper = DictionaryMapping(umls) self.ngram = ngram def process_text(self, text): ptxt = self.process_section(text) return ptxt def process(self, ec_dict): pec = {}
""" <Module Explanation> @author: Praveen Chandar """ from ctgov.utility.log import strd_logger from datetime import datetime from ctgov.utility.web import clean_text import xml.etree.ElementTree as xml_parser import math import re log = strd_logger('ctgov-parser') class ClinicalTrial_Parser(object): def __init__(self, data_path): self.data_path = data_path def parse(self, nct_id): try: trail_path = self.data_path + '/' + nct_id + '.xml' xml = xml_parser.parse(trail_path) # general doc = {} doc['title'] = self.__get_info(xml, 'brief_title') doc['study_type'] = self.__get_info(xml, 'study_type') # Add conditions cond = xml.findall('condition') conditions = []
""" The module contains functions to connect to the elastic search index. @author: Praveen Chandar """ from ctgov.utility.log import strd_logger from elasticsearch import Elasticsearch from elasticsearch_dsl import Search, Q from elasticsearch import ConnectionError import json log = strd_logger('elasticsearch-index') class ElasticSearch_Index(object): def __init__(self, index_name, host='localhost', port=9200): self.index_name = index_name self.host_name = host self.port_number = port self.doc_type = 'trial' self.es = self.get_es_conn() def get_es_conn(self): """ Create an ElasticSearch() object :return: Elasticsearch() instance """ assert isinstance(self.host_name, str)
''' Extract relevant tags from a text @author: Riccardo Miotto <rm3086 (at) columbia (dot) edu> ''' import nltk, string, itertools from ctgov.utility.log import strd_logger from ctgov.utility.web import clean_text log = strd_logger('textprocesser') conj = set(['and', 'or']) class TextProcesser: # constructor def __init__(self, text, ngram=5, stop=None, umls=None, ptag=None): try: self.text = str(text) except UnicodeEncodeError: self.text = str(text.encode('utf-8')) self.text = self.text.lower().strip() self.text = self.text.replace('- ', ' ').replace(' -', ' ') # get filtering data self.ngr = ngram if not stop: self.stop = (set(), set()) else: self.stop = stop self.umls = umls
""" Retrieve Clinical Trials and Stores it onto a directory @author: Praveen Chandar """ from ctgov.utility.log import strd_logger from multiprocessing import Process, Queue import argparse, sys, math import urllib2, urllib3, json import os, shutil import re log = strd_logger('nct-processer') # create directory (delete if one with the same name already exists) def mkdir(dirname, force_create=False): try: os.makedirs(dirname) except OSError: if force_create: shutil.rmtree(dirname) os.makedirs(dirname) else: pass except Exception as e: log.error(e) return False return True def download_web_data(url): try:
""" Clinical Trial representation @author: Riccardo Miotto """ from ctgov.concept_mapping.textprocesser import TextProcesser from ctgov.concept_mapping.cvalue import substring_filtering from ctgov.utility.web import download_web_data from ctgov.utility.log import strd_logger from datetime import datetime import xml.etree.ElementTree as xml_parser import math, re log = strd_logger('clinical-trial') class ClinicalTrial(object): def __init__(self, nctid, data_path): self.trail_path = data_path + '/' + nctid + '.xml' self.id = nctid self.title = None self.condition = set() self.study_type = None self.start_date = None self.firstreceived_date = None self.verification_date = None self.lastchanged_date = None self.completion_date = None self.gender = None self.minimum_age = None self.maximum_age = None
''' Mine CDEs from a collection of trials associated to a disease @author: Riccardo Miotto ''' from cvalue import substring_filtering from ctgov.utility.log import strd_logger import math, numpy, operator log = strd_logger('cde') ''' mine the CDEs ''' def cde_miner(pnct, tags, freq=0.01, umls=None): # mine CDEs cde = _mine_cde(pnct, freq, tags) log.info('------ retained %d CDEs' % len(cde)) # assign cde to inclusion and exclusion ie_cde = {} for k, ct in pnct.iteritems(): for it in ct.pec: itdict = ie_cde.setdefault(it, {}) for t in ct.pec[it]: if t in cde: v = itdict.setdefault(t, 0) itdict[t] = v + 1 ie_cde[it] = itdict
""" Retrieve Clinical Trials and Stores it onto a directory @author: Praveen Chandar """ from ctgov.utility.log import strd_logger from multiprocessing import Process, Queue import ctgov.utility.file as file_utils import ctgov.index.es_index as es_index import ctgov.index.ctgov_parser as ctgov_parser import argparse, sys, math import os log = strd_logger('nct-indexer') def nct_index(din, index_name, host='localhost', port_no=9200, nprocs=1, settings_file=None): # open the clinical trail ids file and load to a list log.info('opening file -- trial_ids.txt') nct_ids = [] for line in open(din + '/trial_ids.txt', 'rb'): nct_ids.append(line.strip()) # Check directories trials_din = din + '/trials_xml/' if not os.path.exists(trials_din): log.error('trials_xml directory does not exists in %s \n' % din) exit(0)
""" Function to Interact with ClinicalTrials.gov @author: Riccardo Miotto <rm3086 (at) columbia (dot) edu> Modified on Sep 15th 2014 @author: Praveen Chandar < (at) columbia (dot) edu > """ import re from ctgov.utility.web import download_web_data from ctgov.utility.log import strd_logger log = strd_logger('ctgov-fetch') def get_clinical_trials(): """ Obtains the latest list of all clinical trials from clinicaltrails.gov :return: """ url = 'http://clinicaltrials.gov/ct2/crawl' html = download_web_data(url) pages = re.findall(r'href="/ct2/crawl/(\d+)"', html) lnct = set() for p in pages: html = download_web_data('%s/%s' % (url, p)) ct = re.findall(r'href="/ct2/show/(NCT\d+)"', html) lnct |= set(ct) return sorted(lnct)
""" Mine CDEs for a set of disease @param yearstep: step for time analysis @author: Riccardo Miotto """ from ctgov.utility.log import strd_logger from ctgov.miner.cde import cde_miner from ctgov.load_data import load_umls from datetime import timedelta import ctgov.utility.file as ufile import argparse, sys, math, datetime, zipfile, os, shutil log = strd_logger('cde-concept_mapping') def mining_cde(nct, disease, nctmin=100, fth=0.03, umls=None, dout=None, yearstep=-1): if yearstep <= 0: yearstep = -1 # get year interval yi = _year_interval(yearstep) # check output directory if not _check_dout(dout): return if yearstep == -1: dout = '%s/all-years' % dout else: dout = '%s/year-step-%d' % (dout, yearstep)
""" <Module Explanation> @author: Praveen Chandar """ import itertools from ctgov.utility.log import strd_logger log = strd_logger('dict-mapping') class DictionaryMapping(object): def __init__(self, umls): self.use_scramble_find = True self.use_split_dashed_words = True self.conj = {'and', 'or'} self.umls = umls def map(self, tokens): if not self.umls: log.warning('UMLS not loaded') return [] # First do direct mapping tags = self._direct_mapping(tokens) # If simple direct mapping fails, try other options if tags is None and self.use_scramble_find: tags = self._scramble_find(tokens) # If scrambling fails, look for dashed words if tags is None and self.use_split_dashed_words:
""" Time Analysis of CDEs @author: Riccardo Miotto """ import argparse from matplotlib.backends.backend_pdf import PdfPages from pylab import * import numpy as np import matplotlib.pyplot as plt from ctgov.utility.log import strd_logger import ctgov.utility.file as ufile log = strd_logger('cde-time-analysis') def cde_analysis(ddata, dout, ystep=1): if ystep < 1: log.error('the year step needs to be greater than 1 -- interrupting') return dout = '%s/year-step-%d' % (dout, ystep) if not ufile.mkdir(dout): log.error('impossible to create the output directory - interrupting') return # get list of diseases ddata = '%s/year-step-%d' % (ddata, ystep) ldis = sorted(os.walk(ddata).next()[1]) yinterval = _year_interval(ystep)
""" Mine CDEs for a set of disease @param yearstep: step for time analysis @author: Riccardo Miotto """ from ctgov.utility.log import strd_logger from ctgov.miner.cde import cde_miner from ctgov.load_data import load_umls from datetime import timedelta import ctgov.utility.file as ufile import argparse, sys, math, datetime, zipfile, os, shutil log = strd_logger('cde-concept_mapping') def mining_cde(nct, disease, nctmin=100, fth=0.03, umls=None, dout=None, yearstep=-1): if yearstep <= 0: yearstep = -1 # get year interval yi = _year_interval(yearstep) # check output directory
""" Retrieve Clinical Trials and Stores it onto a directory @author: Praveen Chandar """ from ctgov.utility.log import strd_logger from multiprocessing import Process, Queue import ctgov.utility.file as file_utils import ctgov.index.es_index as es_index import ctgov.index.ctgov_parser as ctgov_parser import argparse, sys, math import os log = strd_logger('nct-indexer') def nct_index(din, index_name, host='localhost', port_no=9200, nprocs=1, settings_file=None): # open the clinical trail ids file and load to a list log.info('opening file -- trial_ids.txt') nct_ids = [] for line in open(din + '/trial_ids.txt', 'rb'): nct_ids.append(line.strip()) # Check directories trials_din = din + '/trials_xml/'
""" Apply tagging process @author: Praveen Chandar """ from ctgov.load_data import load_data from ctgov.utility.log import strd_logger from multiprocessing import Process import ctgov.index.es_index as es_index from ctgov.concept_mapping.tagger import Tagger import argparse import sys import math log = strd_logger('tag-miner') def nct_tagging(index_name, host, port_no, process_ids, stopwords, umls, pos, nprocs=1): # open the clinical trail ids file to process nct_ids = [] for line in open(process_ids, 'rb'): nct_ids.append(line.strip())
''' Mine CDEs from a collection of trials associated to a disease @author: Riccardo Miotto ''' from cvalue import substring_filtering from ctgov.utility.log import strd_logger import math, numpy, operator log = strd_logger('cde') ''' mine the CDEs ''' def cde_miner(pnct, tags, freq=0.01, umls=None): # mine CDEs cde = _mine_cde(pnct, freq, tags) log.info('------ retained %d CDEs' % len(cde)) # assign cde to inclusion and exclusion ie_cde = {} for k, ct in pnct.iteritems(): for it in ct.pec: itdict = ie_cde.setdefault(it, {}) for t in ct.pec[it]: if t in cde: v = itdict.setdefault(t, 0) itdict[t] = v + 1
''' Retrieve and Process Clinical Trials (extract n-grams from eligibility criteria) @author: Riccardo Miotto ''' from ctgov.utility.log import strd_logger from ctgov.load_data import load_data from multiprocessing import Process, Queue from ctgov.index._clinicaltrial import ClinicalTrial import ctgov.utility.file as ufile import ctgov.index as ctgov import argparse, sys, math log = strd_logger('nct-processer') def nct_processer(dout, stop=None, umls=None, ptag=None, nprocs=1): # get the list of clinical trials log.info('downloading the list of clinical trials') nct = index.get_clinical_trials() if len(nct) == 0: log.error(' --- not found any clinical trials - interrupting \n') return log.info(' --- found %d clinical trials \n' % len(nct)) # process each clinical trial log.info('processing clinical trials') qout = Queue() procs = [] chunksize = int(math.ceil(len(nct) / float(nprocs)))
""" Apply tagging process @author: Praveen Chandar """ from ctgov.load_data import load_data from ctgov.utility.log import strd_logger from multiprocessing import Process import ctgov.index.es_index as es_index from ctgov.concept_mapping.tagger import Tagger import argparse import sys import math log = strd_logger('tag-miner') def nct_tagging(index_name, host, port_no, process_ids, stopwords, umls, pos, nprocs=1): # open the clinical trail ids file to process nct_ids = [] for line in open(process_ids, 'rb'): nct_ids.append(line.strip()) # Check if index exists index = es_index.ElasticSearch_Index(index_name, host=host, port=port_no) index.add_field('ec_tags_umls', term_vector=True) # Get clinical