Example #1
File: lda.py  Project: jrwalk/empath
	def streamer():
		"""Yield one tokenized, stemmed comment at a time for the selected drug."""
		for text in texts(drug=drug):
			text = tokenize(text, drug=drug, pos_filter=False)  # list of tokens
			for i, word in enumerate(text):  # remap brand drug names
				remap = _drug_dict.get(word.upper(), None)
				if remap is not None:
					text[i] = remap.lower()
			text = [stemmer.stem(word) for word in text]  # stem each token
			yield text
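
The generator above is defined inside an enclosing function in lda.py, so drug, texts, tokenize, _drug_dict, and stemmer all come from the surrounding scope. Below is a minimal sketch of how such a streamer might be consumed to train a topic model with gensim; the use of gensim and the topic count are assumptions, since this snippet does not show how streamer is actually used.

# Sketch only: assumes gensim is installed and that streamer() (or an
# equivalent generator) is in scope; num_topics=10 is an illustrative value.
from gensim import corpora, models

docs = list(streamer())                             # materialize once; two passes needed
dictionary = corpora.Dictionary(docs)               # token -> integer id mapping
corpus = [dictionary.doc2bow(doc) for doc in docs]  # bag-of-words vector per comment
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10)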
Example #2
File: word_count.py  Project: jrwalk/empath
# Imports needed by this snippet (inferred from the docstring and call sites);
# tokenize() is a project-local helper whose import is not shown here.
from nltk.probability import FreqDist

import drug_mentions as dm


def word_count(drug=None, limit=None, pos_filter=False, lemma=True):
	"""Scans comment texts (from drug_mentions.texts) for the selected drug
	and calculates the most common words.

	KWARGS:
		drug: string or None.
			Drug selector.  Allows three cases:
			* None: scrape all comments in the database, regardless of drug.
			* 'antidepressant': select comments speaking generically about
				the drug class, not referencing a specific drug.
			* [drug name]: comments referencing a specific drug.
			Default None.  Passed to drug_mentions.texts.
		limit: int or None.
			Optional limit on SQL queries retrieved by drug_mentions.texts.
			Defaults to None (returns all hits).
		pos_filter: boolean.
			Passed to tokenize(); set True to use part-of-speech filtering.
		lemma: boolean.
			Passed to tokenize(); set True to use lemmatization.

	RETURNS:
		freq: nltk.probability.FreqDist object.
			Frequency distribution of words from the comments.

	RAISES:
		ValueError:
			For an invalid drug name.
	"""
	try:
		texts = dm.texts(drug=drug, limit=limit)
	except ValueError:
		raise ValueError('Invalid drug name.')

	# Accumulate token counts across all retrieved comment texts.
	freq = FreqDist()
	for text in texts:
		freq.update(tokenize(text, drug, pos_filter=pos_filter, lemma=lemma))

	return freq
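
A short usage sketch of word_count; the drug name, limit, and top-20 cutoff below are illustrative values, not project defaults.

# Illustrative call: 'prozac' and limit=500 are example arguments only.
freq = word_count(drug='prozac', limit=500, pos_filter=True, lemma=True)
for word, count in freq.most_common(20):  # 20 most frequent tokens
	print("{}: {}".format(word, count))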