def train_classifier():
    """Constructs nltk Naive Bayes classifier using labeled medical comments
    (from Nicole Strang's dataset).

    RETURNS:
        nbc: nltk.classify.NaiveBayesClassifier object.
            Naive Bayes classifier trained on Nicole Strang's labeled patient
            comments from a previous Insight project.  Similar usage (comments
            on drug side effects) but disjoint from the antidepressant data.
    """
    df = pandas.read_csv(
        '/home/jrwalk/python/empath/data/training/Comments.txt',
        index_col=0)
    # drop empty or duplicate comments, and neutral (rating == 3) reviews
    df['Comments'].replace('', np.nan, inplace=True)
    df.dropna(subset=['Comments'], inplace=True)
    df = df.drop_duplicates(subset=['Drug', 'Comments'])
    df = df[df.Rating != 3]
    df['Value'] = ['pos' if x > 3 else 'neg' for x in df['Rating']]

    # we now have a DataFrame of the training data, including user rating,
    # side effects, comments, and patient metadata -- realistically we only
    # need the rating and comments.
    pos_texts = df['Comments'][df['Value'] == 'pos'].tolist()
    neg_texts = df['Comments'][df['Value'] == 'neg'].tolist()

    # nltk's trainer expects (feature dict, label) pairs; use simple
    # word-presence features built from the project's tokenize() helper
    pos_data = [({word: True for word in tokenize(com, None, False, True)},
                 'pos') for com in pos_texts]
    neg_data = [({word: True for word in tokenize(com, None, False, True)},
                 'neg') for com in neg_texts]

    trainingdata = pos_data + neg_data
    classifier = nltk.classify.NaiveBayesClassifier.train(trainingdata)
    return classifier
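
# Hedged usage sketch (illustration only, not part of the original module):
# score a single free-text comment with the classifier above.  Assumes the
# project's tokenize() helper is visible in this scope with the same
# (text, drug, pos_filter, lemma) call pattern used in train_classifier().
def classify_comment(nbc, comment):
    """Return 'pos' or 'neg' for a raw comment string (illustrative helper)."""
    features = {word: True for word in tokenize(comment, None, False, True)}
    return nbc.classify(features)

# e.g.
#   nbc = train_classifier()
#   classify_comment(nbc, "this drug gave me terrible headaches")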
def streamer():
    # generator relying on `texts`, `drug`, `stemmer`, and `_drug_dict` from
    # the surrounding scope: yields one list of stemmed tokens per comment,
    # with brand drug names remapped to their generics
    for text in texts(drug=drug):
        text = tokenize(text, drug=drug, pos_filter=False)  # list of tokens
        for i, word in enumerate(text):
            # remap brand drug names
            remap = _drug_dict.get(word.upper(), None)
            if remap is not None:
                text[i] = remap.lower()
        text = [stemmer.stem(word) for word in text]
        yield text
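
# Hedged usage sketch (illustration only): streamer() is a generator, so a
# consumer can walk the comments lazily without holding them all in memory.
# This helper is hypothetical and would need to live in the same scope that
# defines streamer() (and the `texts`/`drug`/`stemmer` names it relies on).
from collections import Counter

def most_common_tokens(n=20):
    """Count stemmed tokens across the streamed comments (illustrative)."""
    counts = Counter()
    for tokens in streamer():
        counts.update(tokens)
    return counts.most_common(n)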
def fix():
    """Reorders drug-chunk precedence in empath.Chunks.

    For each chunked comment in empath.Comments, looks up which generic drugs
    are mentioned (via empath.Mentions), remaps brand names in the body to
    their generics, then sets `precedence` so the preamble chunk is 0 and each
    drug's chunk is numbered by the order of its first mention in the comment.
    """
    conn = pms.connect(host='localhost', user='******', passwd='',
                       db='empath', charset='utf8',
                       init_command='SET NAMES UTF8')
    cur = conn.cursor()

    # get chunked comment ids and their per-drug mention flags
    query = "SELECT c.id,c.body"
    for gen in _generics:
        query += (",m.%s" % gen.lower())
    query += " FROM Comments c "
    query += "JOIN Mentions m on c.id=m.id WHERE c.chunked=True"
    cur.execute(query)

    data = {}
    for row in cur:
        drugs = np.array([uniconvert(d) for d in row[2:]])
        dmap = np.where(drugs == 1)
        drugs = [d.lower() for d in list(np.array(_generics)[dmap])]
        data[row[0]] = (row[1], drugs)

    for post_id in data.keys():
        body, drugs = data[post_id]
        body = body.lower()
        # replace brand-name synonyms with the generic drug name
        for drug in drugs:
            for remap in _gen_dict.get(drug.upper(), [drug.upper()]):
                body = body.replace(remap.lower(), drug.lower())

        # set the preamble chunk to the correct precedence
        query = ("UPDATE Chunks SET precedence=0 WHERE (id='%s' "
                 "AND drug='preamble')" % post_id)
        cur.execute(query)

        # get order of drug mentions
        tokens = tokenize(body, drug=None, pos_filter=False, lemma=False)
        ordered_drugs = []
        for word in tokens:
            if word in drugs:
                ordered_drugs.append(word)
        ordered_drugs = OrderedSet(ordered_drugs)

        for i, drug in enumerate(ordered_drugs):
            query = ("UPDATE Chunks SET precedence=%i WHERE (id='%s' "
                     "AND drug='%s')" % (i + 1, post_id, drug))
            cur.execute(query)

    conn.commit()
    conn.close()
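
# Hedged sketch (illustration only): the precedence numbering above relies on
# de-duplicating drug mentions while preserving first-mention order, which is
# what the OrderedSet gives.  This hypothetical stdlib equivalent behaves the
# same way for that purpose (dicts preserve insertion order in Python 3.7+).
def first_mention_order(tokens, drugs):
    """Return drugs in the order they first appear in the token list."""
    drugset = set(drugs)
    mentioned = [word for word in tokens if word in drugset]
    return list(dict.fromkeys(mentioned))

# first_mention_order(['took', 'prozac', 'then', 'zoloft', 'prozac'],
#                     ['zoloft', 'prozac'])  ->  ['prozac', 'zoloft']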
def tally():
    """Reads through the empath.Comments db and detects which drugs are
    mentioned in each comment body.  Populates the empath.Mentions db with
    the mention count and a flag for each drug mentioned.
    """
    conn = pms.connect(host='localhost', user='******', passwd='',
                       db='empath', charset='utf8',
                       init_command='SET NAMES UTF8')
    cur = conn.cursor()

    cur.execute('select id,body from Comments')
    posts = []
    for row in cur:
        posts.append((row[0], row[1].upper()))

    for post_id, body in posts:
        # remap brand names to generic drug names
        for drug in _drug_dict:
            body = body.replace(drug, _drug_dict[drug])
        tokens = tokenize(body, None, False, False)

        # generate row in `empath`.`Mentions` for the post, then loop through
        # generics, detect presence, and update Mentions as needed
        try:
            cur.execute('INSERT INTO `Mentions` (`id`) VALUES (%s)',
                        (post_id,))
            conn.commit()
        except:
            # row already exists for this id -- fall through and just update
            pass

        counter = 0
        for drug in _generics:
            if drug.lower() in tokens:
                counter += 1
                flagger = ("UPDATE `Mentions` SET `%s`=True WHERE `id`='%s'"
                           % (drug.lower(), post_id))
                cur.execute(flagger)
        cur.execute("UPDATE `Mentions` SET `count`=%s WHERE `id`='%s'"
                    % (counter, post_id))
        conn.commit()

    conn.close()
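
# Hedged usage sketch (illustration only): after tally() runs, per-drug
# totals can be pulled back out of `empath`.`Mentions`.  The column names
# assumed here are the lower-cased generic names that tally() writes; this
# helper is hypothetical and takes an open cursor plus the _generics list.
def mention_totals(cur, generics):
    """Count how many comments mention each generic drug (illustrative)."""
    totals = {}
    for drug in generics:
        cur.execute("SELECT COUNT(*) FROM `Mentions` WHERE `%s`=True"
                    % drug.lower())
        totals[drug.lower()] = cur.fetchone()[0]
    return totals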
def tokenize_leaves(tree):
    """renders leaves of input tree down into tokenized list, accounting for
    stopwords, lemmatizing, and punctuation.

    ARGS:
        tree: nltk.tree.Tree object.
            input tree or subtree.

    RETURNS:
        tokens: list.
            list of tokenized words in tree.
    """
    nested = [tokenize(block, None, False, True) for block in tree.leaves()]
    leaves = []
    for block in nested:
        for word in block:
            leaves.append(word)
    return leaves
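
# Hedged usage sketch (illustration only): flatten a hand-built chunk tree.
# Assumes the project's tokenize() helper is visible in this scope; in the
# real pipeline the trees come from the chunker rather than Tree.fromstring().
def _demo_tokenize_leaves():
    from nltk.tree import Tree
    tree = Tree.fromstring("(NP (JJ severe) (NN nausea) (CC and) (NN headaches))")
    # roughly ['severe', 'nausea', 'headache'], depending on the stopword and
    # lemmatizing rules inside tokenize()
    return tokenize_leaves(tree)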