'''
 Index a collection of clinical trial eligibility criteria using the frequent tags (controlled vocabulary)
 
 @author: Riccardo Miotto <rm3086 (at) columbia (dot) edu>
'''

from lib.cvocab.textprocesser import TextProcesser
from lib.cvocab.cvalue import substring_filtering
from multiprocessing import Process, Queue
from lib.utility.log import strd_logger
import math, re

log = strd_logger('ec-indexer')


def indexer(docs, cvocab, ngr=5, stop=None, umls=None, rneg=None, nprocs=1):
    # worker
    def worker(docs, ldoc, cvocab, ngr, stop, umls, rneg, qout, p):
        '''
        Index the documents whose keys are listed in "ldoc" and put the
        outcome on "qout" as a single (indexed documents, counter) tuple.

        For every key d in "ldoc", docs[d][1] is handed to _index_ec
        together with cvocab, ngr, stop, umls and rneg; entries whose
        docs[d][1] is None are skipped. The first element returned by
        _index_ec is merged into docs[d][0] with the "|" operator
        (presumably a set union -- verify against the caller) and the
        second element is added to a running total.

        "p" is only used to label the progress log messages.
        '''
        indexed = {}
        total = 0
        for count, doc_id in enumerate(ldoc, start=1):
            # progress message every 500 documents
            if count % 500 == 0:
                log.info(' --- core %d: processed %d documents' % (p, count))
            entry = docs[doc_id]
            criteria = entry[1]
            if criteria is None:
                continue
            tags, delta = _index_ec(criteria, cvocab, ngr, stop, umls, rneg)
            indexed[doc_id] = entry[0] | tags
            total += delta
        qout.put((indexed, total))
示例#2
0
'''
 Extract relevant tags from a text

  @author: Riccardo Miotto <rm3086 (at) columbia (dot) edu>
'''

import nltk, string, itertools
from lib.nlp import negex

from lib.utility.log import strd_logger
log = strd_logger('ctec-tag-mining')

conj = set(['and', 'or'])


class TextProcesser:
    '''
    Wrap a piece of text and keep a normalised copy of it in "self.text".
    '''

    # constructor
    def __init__(self, text, ngram=5, stop=None, umls=None, ptag=None,
                 negrule=None, avocab=None):
        '''
        Store a lower-cased, stripped copy of "text" with dangling
        hyphens removed.

        The text is converted with str(); if that raises
        UnicodeEncodeError the text is encoded as UTF-8 first. Every
        "- " and " -" sequence is then replaced by a single space, so
        hyphens that sit inside a word (e.g. "well-known") are kept.

        NOTE(review): ngram, stop, umls, ptag, negrule and avocab are
        accepted but not used by this normalisation step.
        '''
        try:
            cleaned = str(text.lower().strip())
        except UnicodeEncodeError:
            cleaned = str(text.lower().strip().encode('utf-8'))
        # drop hyphens that are detached from a word on either side
        for dangling in ('- ', ' -'):
            cleaned = cleaned.replace(dangling, ' ')
        self.text = cleaned
'''
 Mine a controlled vocabulary of tags from a collection of documents

  @author: Riccardo Miotto <rm3086 (at) columbia (dot) edu>
'''
from .textprocesser import TextProcesser
import math, numpy

from .cvalue import substring_filtering
from multiprocessing import Process, Queue
from lib.utility.log import strd_logger

log = strd_logger('tag-miner')


def tag_miner (docs, freq = 0.01, ngr = 5, stop = None, umls = None, ptag = None, nprocs = 1):
	# worker
	def worker (docs, ldoc, ngr, stop, umls, ptag, qout, p):
		'''
		Parse the documents whose keys are listed in "ldoc".

		Each docs[d] is passed to _parse_text together with ngr, stop,
		umls and ptag; documents with an empty parsing result are skipped,
		the others are stored in "ptxt" under the same key.

		"p" is only used to label the progress log messages.
		NOTE(review): "qout" is presumably the queue receiving the worker
		output -- confirm against the rest of the function.
		'''
		# parsed documents, keyed like "docs"
		ptxt = {}
		# per-tag counter, filled while scanning the parsed documents
		idf = {}
		# 1-based position of the current document, used for progress logging
		i = 1
		for d in ldoc:
			# progress message every 500 documents
			if i % 500 == 0:
				log.info (' --- core %d: processed %d documents' % (p,i))
			i += 1
			pdoc = _parse_text (docs[d], ngr, stop, umls, ptag)
			# nothing extracted from this document: do not store it
			if not pdoc:
				continue
			ptxt[d] = pdoc
			for t in ptxt[d]:
				# make sure the tag has an entry in "idf" (0 when first seen)
				# and read its current value
				val = idf.setdefault (t,0)
'''
 Function to Interact with ClinicalTrials.gov

 @author: Riccardo Miotto <rm3086 (at) columbia (dot) edu>
 '''

import re, math
from lib.utility.web import download_web_data
import xml.etree.ElementTree as pxml
from lib.utility.log import strd_logger
from xml.etree import ElementTree
import lib.utility.file as ufile

log = strd_logger('arule-mining')


# list of all trials available
def get_clinical_trials():
    '''
	url = 'http://clinicaltrials.gov/ct2/crawl'
	html = download_web_data (url)
	pages = re.findall (r'href="/ct2/crawl/(\d+)"', html)
	lnct = set()
	print (lnct)
	for p in pages:
		html = None
		while html is None:
			html = download_web_data ('%s/%s' % (url, p))
		ct = re.findall (r'href="/ct2/show/(NCT\d+)"', html)
		lnct |= set(ct)
	'''