Example #1
	def eligibility(self):
		if self._eligibility is None:
			elig_obj = self.doc.get('_eligibility_obj') if self.doc else None
			self._eligibility = EligibilityCriteria(elig_obj)
			
			# no object yet, parse from JSON
			if elig_obj is None and self.doc:
				self._eligibility.load_lilly_json(self.doc.get('eligibility'))
				self.doc['_eligibility_obj'] = self._eligibility.doc
				self.store({'_eligibility_obj': self._eligibility.doc})
		
		return self._eligibility
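The property lazily instantiates an EligibilityCriteria object and caches it twice: on `self._eligibility` in memory and, after the first parse, in the document under `_eligibility_obj`, so the Lilly JSON is only parsed once per trial. A minimal usage sketch, assuming `trial` is a loaded `Trial` instance:

# first access parses trial.doc['eligibility'] and persists '_eligibility_obj'
criteria = trial.eligibility
# later accesses return the cached instance; nothing is re-parsed or re-stored
assert trial.eligibility is criteria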
Example #2
class Trial (MNGObject):
	""" Describes a trial found on ClinicalTrials.gov.
	"""
	
	collection_name = 'studies'
	
	def __init__(self, nct=None):
		super(Trial, self).__init__(nct)
		self._title = None
		self.papers = None
		
		# eligibility & analyzables
		self._eligibility = None
		self.analyze_keypaths = None
		self._analyzables = None
		
		# NLP
		self.nlp = None
		self.waiting_for_ctakes_pmc = False
	
	
	# -------------------------------------------------------------------------- Properties
	@property
	def nct(self):
		return self.id
	
	@property
	def title(self):
		""" Construct the best title possible. """
		if not self._title:
			if not self.loaded:
				self.load()
			
			if self.doc is None:
				return 'Unknown Title'
			
			# we have a document, create the title
			title = self.doc.get('official_title')
			if not title:
				title = self.doc.get('brief_title')
			acronym = self.doc.get('acronym')
			if acronym:
				if title:
					title = "%s: %s" % (acronym, title)
				else:
					title = acronym
			self._title = title
		
		return self._title
			
	@property
	def entered(self):
		""" How many years ago was the trial entered into ClinicalTrials.gov. """
		now = datetime.datetime.now()
		first = self.date('firstreceived_date')
		return round((now - first[1]).days / 365.25 * 10) / 10 if first[1] else None
		
	@property
	def last_updated(self):
		""" How many years ago was the trial last updated. """
		now = datetime.datetime.now()
		last = self.date('lastchanged_date')
		return round((now - last[1]).days / 365.25 * 10) / 10 if last[1] else None
	
	@property
	def eligibility_inclusion(self):
		return self.eligibility.inclusion_text
	
	@property
	def eligibility_exclusion(self):
		return self.eligibility.exclusion_text
	
	@property
	def intervention_types(self):
		""" Returns a set of intervention types of the receiver. """
		types = set()
		for intervent in self.intervention:
			inter_type = intervent.get('intervention_type')
			if inter_type:
				types.add(inter_type)
		
		if 0 == len(types):
			types.add('Observational')
		
		return types
	
	@property
	def trial_phases(self):
		""" Returns a set of phases in drug trials.
		Non-drug trials might still declare trial phases; we don't filter those out.
		"""
		my_phases = self.phase
		if my_phases and 'N/A' != my_phases:
			phases = set(my_phases.split('/'))
		else:
			phases = set(['N/A'])
		
		return phases

	def __getattr__(self, name):
		""" As last resort, we forward calls to non-existing properties to our
		document. """
		
		if not self.loaded:
			self.load()
		
		if self.doc:
			return self.doc.get(name)
		raise AttributeError(name)
	
	
	def date(self, dt):
		""" Returns a tuple of the string date and the parsed Date object for
		the requested JSON object. """
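		# e.g. a stored value of "January 23, 2012" comes back as
		# ("January 23, 2012", datetime.datetime(2012, 1, 23, 0, 0))
		# (illustrative value; real strings come from the trial document)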
		dateval = None
		parsed = None
		
		if dt is not None:
			date_dict = self.doc.get(dt) if self.doc else None
			if type(date_dict) is dict:
				dateval = date_dict.get('value')
				
				# got it, parse
				if dateval:
					dateregex = re.compile(r'(\w+)\s+((\d+),\s+)?(\d+)')
					searched = dateregex.search(dateval)
					if searched is not None:
						match = searched.groups()
						
						# convert to almost-ISO-8601; if the day is missing, use 28 so February doesn't trip the parser
						day = ('00' + match[2])[-2:] if match[2] else '28'
						fmt = "%s-%s-%s" % (match[3], match[0][:3], day)
						parsed = dateutil.parser.parse(fmt)
		
		return (dateval, parsed)
	
	
	def update_from_lilly(self, json):
		""" Incoming JSON from Lilly; for efficiency we drop all content
		except keys starting with an underscore. Faster than deepUpdate, which
		usually just replaces everything from Lilly's JSON anyway. """
		
		if json is None:
			return
		
		if self.id is None:
			self.id = json.get('id')
		
		if not self.loaded:
			self.load()
		
		if self.doc is not None:
			for key, val in self.doc.iteritems():
				if '_' == key[:1]:
					json[key] = val
		
		self.replace_with(json)
	
	
	def did_update_doc(self):
		""" We may need to fix some keywords. """
		if 'keyword' in self.doc:
			self.doc['keyword'] = self.cleanup_keywords(self.doc['keyword'])
	
	
	def json(self, extra_fields=['brief_summary']):
		""" Returns a JSON-ready representation.
		There is a standard set of fields and the fields stated in
		"extra_fields" will be appended.
		"""
		
		# main dict
		d = {
			'nct': self.id,
			'title': self.title,
		}
		
		# add extra fields
		if self.doc is not None:
			for fld in extra_fields:
				d[fld] = getattr(self, fld)
		elif extra_fields is not None and len(extra_fields) > 0:
			logging.debug("Requesting extra fields %s but don't have a document" % extra_fields)
		
		return d
	
	def report_row(self):
		""" Generates an HTML row for the report_row document.
		"""
		return self.eligibility.report_row()
	
	
	# -------------------------------------------------------------------------- PubMed
	def run_pmc(self, run_dir):
		""" Finds, downloads, extracts and parses PMC-indexed publications for
		the trial. """
		self.find_pmc_packages()
		self.download_pmc_packages(run_dir)
		self.parse_pmc_packages(run_dir)
	
	
	def find_pmc_packages(self):
		""" Determine whether there was a PMC-indexed publication for the trial.
		"""
		if self.nct is None:
			logging.warning("Need an NCT before trying to find publications")
			return
		
		# find paper details
		self.papers = Paper.find_by_nct(self.nct)
		for paper in self.papers:
			paper.fetch_pmc_ids()
	
	
	def download_pmc_packages(self, run_dir):
		""" Downloads the PubMed Central packages for our papers. """
		
		if self.papers is not None:
			for paper in self.papers:
				paper.download_pmc_packages(run_dir)
	
	
	def parse_pmc_packages(self, run_dir):
		""" Looks for downloaded packages in the given run directory and
		extracts the paper text from the XML in the .nxml file.
		"""
		if self.papers is None:
			return
		
		import os.path
		if not os.path.exists(run_dir):
			raise Exception("The run directory %s doesn't exist" % run_dir)
		
		import codecs
		ct_in_dir = os.path.join(Trial.ctakes.get('root', run_dir), 'ctakes_input')
		for paper in self.papers:
			paper.parse_pmc_packages(run_dir, ct_in_dir)
			
			# also dump CT criteria if the paper has methods
			if paper.has_methods:
				plaintextpath = os.path.join(ct_in_dir, "%s-%s-CT.txt" % (self.nct, paper.pmid))
				with codecs.open(plaintextpath, 'w', 'utf-8') as handle:
					handle.write(self.eligibility.formatted())
				
				self.waiting_for_ctakes_pmc = True
	
	
	# -------------------------------------------------------------------------- Persistence
	def codified_properties(self):
		""" Returns all codified properties. """
		return self.doc.get('_codified') if self.doc else None
	
	def load_codified_property(self, prop, nlp_name=None):
		""" Checks if the given property has been codified by the given NLP
		engine and loads the codes if so.
		If no NLP name is given, codes from all existing pipelines are returned. """
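		# e.g. (illustrative keypath and pipeline name):
		#   load_codified_property('eligibility', 'ctakes')  -> codes from cTAKES only
		#   load_codified_property('eligibility')            -> dict of all pipelines' codes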
		if not self.loaded:
			self.load()
		
		codifieds = self.doc.get('_codified') if self.doc else None
		cod_all = codifieds.get(prop) if codifieds else None
		if nlp_name is None:
			return cod_all
		
		return cod_all.get(nlp_name) if cod_all else None
	
	def store_codified_property(self, prop, codes, nlp_name):
		""" Stores the codes generated by the named NLP pipeline for the given
		property. """
		
		# store partial
		if codes and len(codes) > 0:
			key = '_codified.%s.%s' % (prop, nlp_name)
			self.store({key: codes})
	
	
	# -------------------------------------------------------------------------- Eligibility Criteria
	@property
	def eligibility(self):
		if self._eligibility is None:
			elig_obj = self.doc.get('_eligibility_obj') if self.doc else None
			self._eligibility = EligibilityCriteria(elig_obj)
			
			# no object yet, parse from JSON
			if elig_obj is None and self.doc:
				self._eligibility.load_lilly_json(self.doc.get('eligibility'))
				self.doc['_eligibility_obj'] = self._eligibility.doc
				self.store({'_eligibility_obj': self._eligibility.doc})
		
		return self._eligibility
	
	
	# -------------------------------------------------------------------------- NLP
	def codify_analyzable(self, keypath, nlp_pipelines, force=False):
		""" Take care of codifying the given keypath using an analyzable.
		This method will be called before the NLP pipeline(s) are being run and
		might be run again afterwards, if trials have been waiting for the NLP
		pipeline to complete. """
		
		# make sure we know about this keypath
		if self.analyze_keypaths is None:
			self.analyze_keypaths = [keypath]
		elif keypath not in self.analyze_keypaths:
			self.analyze_keypaths.append(keypath)
		
		self._codify_analyzable(keypath, nlp_pipelines, force)
	
	def _codify_analyzable(self, keypath, nlp_pipelines, force=False):
		""" Use internally. """
		if keypath is None:
			raise Exception("You must provide a keypath to 'codify_analyzable'")
		
		# get Analyzable object
		if self._analyzables is None:
			self._analyzables = {}
		
		if keypath not in self._analyzables:
			analyzable = Analyzable(self, keypath)
			self._analyzables[keypath] = analyzable
			
			# load from db
			stored = self.load_codified_property(keypath)
			if stored is not None:
				analyzable.codified = stored
		else:
			analyzable = self._analyzables[keypath]
		
		# codify (if needed) and store
		newly_stored = analyzable.codify(nlp_pipelines, force)
		if newly_stored:
			for nlp, content in newly_stored.iteritems():
				self.store_codified_property(keypath, content, nlp)
	
	def codify_analyzables(self, nlp_pipelines, force=False):
		""" Codifies all analyzables that the receiver knows about. """
		if self.analyze_keypaths is None:
			return
		
		for keypath in self.analyze_keypaths:
			self._codify_analyzable(keypath, nlp_pipelines, force)
	
	def analyzable_results(self):
		""" Returns codified results for our analyzables, with the following
		hierarchy:
		{ property: { nlp_name: { date: <date>, codes: { type: [#, #, ...] } } } }
		"""
		if not self._analyzables:
			return None
		
		d = {}
		for prop, analyzable in self._analyzables.iteritems():
			d[prop] = analyzable.codified
		return d
	
	
	def waiting_for_nlp(self, check_pipelines):
		""" Returns a set of NLP names if any of our criteria needs to run
		through that NLP pipeline.
		"""
		s = set()
		
		for n in check_pipelines:
			if 'ctakes' == n.name and self.waiting_for_ctakes_pmc:
				s.add(n.name)
			elif self._analyzables:
				for prop, analyzable in self._analyzables.iteritems():
					if analyzable.waiting_for_nlp(n.name):
						s.add(n.name)
						break
		
		return s
	
	
	def filter_snomed(self, exclusion_codes):
		""" Returns the SNOMED code if the trial should be filtered, None
		otherwise. """
		
		if self.eligibility is None:
			return None
		
		return self.eligibility.exclude_by_snomed(exclusion_codes)
	
	
	# -------------------------------------------------------------------------- Trial Locations
	def locations_closest_to(self, lat, lng, limit=0, open_only=True):
		""" Returns a list of tuples, containing the trial location and their
		distance to the provided latitude and longitude.
		If limit is > 0 then only the closest x locations are being returned.
		If open_only is True, only (not yet) recruiting locations are
		considered.
		"""
		closest = []
		
		# compute all distances (locations are instantiated ad hoc, not cached)
		if self.location is not None:
			for loc_json in self.location:
				loc = TrialLocation(self, loc_json)
				
				if not open_only \
					or 'Recruiting' == loc.status \
					or 'Not yet recruiting' == loc.status \
					or 'Enrolling by invitation' == loc.status:
					
					closest.append((loc, loc.km_distance_from(lat, lng)))
		
		# sort and truncate
		closest.sort(key=lambda tup: tup[1])
		
		if limit > 0 and len(closest) > limit:
			closest = closest[0:limit]
		
		return closest
	
	
	# -------------------------------------------------------------------------- Keywords
	def cleanup_keywords(self, keywords):
		""" Cleanup keywords. """
		better = []
		re_split = re.compile(r';\s+')		# would be nice to also split on comma, but some people use it
											# intentionally in tags (like "arthritis, rheumatoid")
		re_sub = re.compile(r'[,\.]+\s*$')
		for keyword in keywords:
			for kw in re_split.split(keyword):
				if kw:
					kw = re_sub.sub('', kw)
					better.append(kw)
		
		return better
	
	
	# -------------------------------------------------------------------------- Utilities
	def __unicode__(self):
		return '<trial.Trial %s>' % (self.id)
	
	def __str__(self):
		return unicode(self).encode('utf-8')
	
	def __repr__(self):
		return str(self)
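Taken together, typical use of the class might look like the following minimal sketch; the NCT id and coordinates are illustrative, and a populated 'studies' collection plus the module's imports are assumed:

trial = Trial('NCT01234567')		# hypothetical id
print trial.title					# acronym plus official or brief title
print trial.trial_phases			# e.g. set(['2']) or set(['N/A'])
for loc, km in trial.locations_closest_to(42.36, -71.06, limit=3):
	print loc.status, km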