''' Index a collection of clinical trial eligibility criteria using the frequent tags (controlled vocabulary) @author: Riccardo Miotto <rm3086 (at) columbia (dot) edu> ''' from lib.cvocab.textprocesser import TextProcesser from lib.cvocab.cvalue import substring_filtering from multiprocessing import Process, Queue from lib.utility.log import strd_logger import math, re log = strd_logger('ec-indexer') def indexer(docs, cvocab, ngr=5, stop=None, umls=None, rneg=None, nprocs=1): # worker def worker(docs, ldoc, cvocab, ngr, stop, umls, rneg, qout, p): ptxt = {} i = 1 cdiv = 0 for d in ldoc: if i % 500 == 0: log.info(' --- core %d: processed %d documents' % (p, i)) i += 1 if docs[d][1] is not None: (pec, c) = _index_ec(docs[d][1], cvocab, ngr, stop, umls, rneg) ptxt[d] = docs[d][0] | pec cdiv += c qout.put((ptxt, cdiv))
''' Extract relevant tags from a text @author: Riccardo Miotto <rm3086 (at) columbia (dot) edu> ''' import nltk, string, itertools from lib.nlp import negex from lib.utility.log import strd_logger log = strd_logger('ctec-tag-mining') conj = set(['and', 'or']) class TextProcesser: # constructor def __init__(self, text, ngram=5, stop=None, umls=None, ptag=None, negrule=None, avocab=None): try: self.text = str(text.lower().strip()) except UnicodeEncodeError: self.text = str(text.lower().strip().encode('utf-8')) self.text = self.text.replace('- ', ' ').replace(' -', ' ')
''' Mine a controlled vocabulary of tags from a collection of documents @author: Riccardo Miotto <rm3086 (at) columbia (dot) edu> ''' from .textprocesser import TextProcesser import math, numpy from .cvalue import substring_filtering from multiprocessing import Process, Queue from lib.utility.log import strd_logger log = strd_logger('tag-miner') def tag_miner (docs, freq = 0.01, ngr = 5, stop = None, umls = None, ptag = None, nprocs = 1): # worker def worker (docs, ldoc, ngr, stop, umls, ptag, qout, p): ptxt = {} idf = {} i = 1 for d in ldoc: if i % 500 == 0: log.info (' --- core %d: processed %d documents' % (p,i)) i += 1 pdoc = _parse_text (docs[d], ngr, stop, umls, ptag) if not pdoc: continue ptxt[d] = pdoc for t in ptxt[d]: val = idf.setdefault (t,0)
''' Function to Interact with ClinicalTrials.gov @author: Riccardo Miotto <rm3086 (at) columbia (dot) edu> ''' import re, math from lib.utility.web import download_web_data import xml.etree.ElementTree as pxml from lib.utility.log import strd_logger from xml.etree import ElementTree import lib.utility.file as ufile log = strd_logger('arule-mining') # list of all trials available def get_clinical_trials(): ''' url = 'http://clinicaltrials.gov/ct2/crawl' html = download_web_data (url) pages = re.findall (r'href="/ct2/crawl/(\d+)"', html) lnct = set() print (lnct) for p in pages: html = None while html is None: html = download_web_data ('%s/%s' % (url, p)) ct = re.findall (r'href="/ct2/show/(NCT\d+)"', html) lnct |= set(ct) '''