def train_classifier():
    """Constructs nltk Naive Bayes classifier using labeled medical comments
    (from Nicole Strang's dataset).

    RETURNS:
        nbc: nltk.classify.NaiveBayesClassifier object.
            Naive Bayes classifier trained on Nicole Strang's labeled patient
            comments from a previous Insight project.  Similar usage (comments
            on drug side effects) but disjoint from the antidepressant data.
    """
    df = pandas.read_csv(
        '/home/jrwalk/python/empath/data/training/Comments.txt',
        index_col=0)
    # drop empty or duplicate comments, and neutral (rating == 3) reviews
    df['Comments'].replace('', np.nan, inplace=True)
    df.dropna(subset=['Comments'], inplace=True)
    df = df.drop_duplicates(subset=['Drug', 'Comments'])
    df = df[df.Rating != 3]
    df['Value'] = ['pos' if x > 3 else 'neg' for x in df['Rating']]

    # we now have a DataFrame of the training data, including user rating,
    # side effects, comments, and patient metadata -- realistically we only
    # need the rating and comments.
    pos_texts = df['Comments'][df['Value'] == 'pos'].tolist()
    neg_texts = df['Comments'][df['Value'] == 'neg'].tolist()

    # nltk's trainer expects (feature dict, label) pairs; use simple
    # word-presence features built from the project's tokenize() helper
    pos_data = [({word: True for word in tokenize(com, None, False, True)},
                 'pos') for com in pos_texts]
    neg_data = [({word: True for word in tokenize(com, None, False, True)},
                 'neg') for com in neg_texts]

    trainingdata = pos_data + neg_data
    classifier = nltk.classify.NaiveBayesClassifier.train(trainingdata)
    return classifier
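
# Hedged usage sketch (illustration only, not part of the original module):
# score a single free-text comment with the classifier above.  Assumes the
# project's tokenize() helper is visible in this scope with the same
# (text, drug, pos_filter, lemma) call pattern used in train_classifier().
def classify_comment(nbc, comment):
    """Return 'pos' or 'neg' for a raw comment string (illustrative helper)."""
    features = {word: True for word in tokenize(comment, None, False, True)}
    return nbc.classify(features)

# e.g.
#   nbc = train_classifier()
#   classify_comment(nbc, "this drug gave me terrible headaches")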
def streamer():
    # generator relying on `texts`, `drug`, `stemmer`, and `_drug_dict` from
    # the surrounding scope: yields one list of stemmed tokens per comment,
    # with brand drug names remapped to their generics
    for text in texts(drug=drug):
        text = tokenize(text, drug=drug, pos_filter=False)  # list of tokens
        for i, word in enumerate(text):
            # remap brand drug names
            remap = _drug_dict.get(word.upper(), None)
            if remap is not None:
                text[i] = remap.lower()
        text = [stemmer.stem(word) for word in text]
        yield text
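
# Hedged usage sketch (illustration only): streamer() is a generator, so a
# consumer can walk the comments lazily without holding them all in memory.
# This helper is hypothetical and would need to live in the same scope that
# defines streamer() (and the `texts`/`drug`/`stemmer` names it relies on).
from collections import Counter

def most_common_tokens(n=20):
    """Count stemmed tokens across the streamed comments (illustrative)."""
    counts = Counter()
    for tokens in streamer():
        counts.update(tokens)
    return counts.most_common(n)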
def fix():
    """Reorders drug-chunk precedence in empath.Chunks.

    For each chunked comment in empath.Comments, looks up which generic drugs
    are mentioned (via empath.Mentions), remaps brand names in the body to
    their generics, then sets `precedence` so the preamble chunk is 0 and each
    drug's chunk is numbered by the order of its first mention in the comment.
    """
    conn = pms.connect(host='localhost', user='******', passwd='',
                       db='empath', charset='utf8',
                       init_command='SET NAMES UTF8')
    cur = conn.cursor()

    # get chunked comment ids and their per-drug mention flags
    query = "SELECT c.id,c.body"
    for gen in _generics:
        query += (",m.%s" % gen.lower())
    query += " FROM Comments c "
    query += "JOIN Mentions m on c.id=m.id WHERE c.chunked=True"
    cur.execute(query)

    data = {}
    for row in cur:
        drugs = np.array([uniconvert(d) for d in row[2:]])
        dmap = np.where(drugs == 1)
        drugs = [d.lower() for d in list(np.array(_generics)[dmap])]
        data[row[0]] = (row[1], drugs)

    for post_id in data.keys():
        body, drugs = data[post_id]
        body = body.lower()
        # replace brand-name synonyms with the generic drug name
        for drug in drugs:
            for remap in _gen_dict.get(drug.upper(), [drug.upper()]):
                body = body.replace(remap.lower(), drug.lower())

        # set the preamble chunk to the correct precedence
        query = ("UPDATE Chunks SET precedence=0 WHERE (id='%s' "
                 "AND drug='preamble')" % post_id)
        cur.execute(query)

        # get order of drug mentions
        tokens = tokenize(body, drug=None, pos_filter=False, lemma=False)
        ordered_drugs = []
        for word in tokens:
            if word in drugs:
                ordered_drugs.append(word)
        ordered_drugs = OrderedSet(ordered_drugs)

        for i, drug in enumerate(ordered_drugs):
            query = ("UPDATE Chunks SET precedence=%i WHERE (id='%s' "
                     "AND drug='%s')" % (i + 1, post_id, drug))
            cur.execute(query)

    conn.commit()
    conn.close()
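
# Hedged sketch (illustration only): the precedence numbering above relies on
# de-duplicating drug mentions while preserving first-mention order, which is
# what the OrderedSet gives.  This hypothetical stdlib equivalent behaves the
# same way for that purpose (dicts preserve insertion order in Python 3.7+).
def first_mention_order(tokens, drugs):
    """Return drugs in the order they first appear in the token list."""
    drugset = set(drugs)
    mentioned = [word for word in tokens if word in drugset]
    return list(dict.fromkeys(mentioned))

# first_mention_order(['took', 'prozac', 'then', 'zoloft', 'prozac'],
#                     ['zoloft', 'prozac'])  ->  ['prozac', 'zoloft']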
def tally():
    """Reads through the empath.Comments db and detects which drugs are
    mentioned in each comment body.  Populates the empath.Mentions db with
    the mention count and a flag for each drug mentioned.
    """
    conn = pms.connect(host='localhost', user='******', passwd='',
                       db='empath', charset='utf8',
                       init_command='SET NAMES UTF8')
    cur = conn.cursor()

    cur.execute('select id,body from Comments')
    posts = []
    for row in cur:
        posts.append((row[0], row[1].upper()))

    for post_id, body in posts:
        # remap brand names to generic drug names
        for drug in _drug_dict:
            body = body.replace(drug, _drug_dict[drug])
        tokens = tokenize(body, None, False, False)

        # generate row in `empath`.`Mentions` for the post, then loop through
        # generics, detect presence, and update Mentions as needed
        try:
            cur.execute('INSERT INTO `Mentions` (`id`) VALUES (%s)',
                        (post_id,))
            conn.commit()
        except:
            # row already exists for this id -- fall through and just update
            pass

        counter = 0
        for drug in _generics:
            if drug.lower() in tokens:
                counter += 1
                flagger = ("UPDATE `Mentions` SET `%s`=True WHERE `id`='%s'"
                           % (drug.lower(), post_id))
                cur.execute(flagger)
        cur.execute("UPDATE `Mentions` SET `count`=%s WHERE `id`='%s'"
                    % (counter, post_id))
        conn.commit()

    conn.close()
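
# Hedged usage sketch (illustration only): after tally() runs, per-drug
# totals can be pulled back out of `empath`.`Mentions`.  The column names
# assumed here are the lower-cased generic names that tally() writes; this
# helper is hypothetical and takes an open cursor plus the _generics list.
def mention_totals(cur, generics):
    """Count how many comments mention each generic drug (illustrative)."""
    totals = {}
    for drug in generics:
        cur.execute("SELECT COUNT(*) FROM `Mentions` WHERE `%s`=True"
                    % drug.lower())
        totals[drug.lower()] = cur.fetchone()[0]
    return totals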
def tokenize_leaves(tree):
    """renders leaves of input tree down into tokenized list, accounting for
    stopwords, lemmatizing, and punctuation.

    ARGS:
        tree: nltk.tree.Tree object.
            input tree or subtree.

    RETURNS:
        tokens: list.
            list of tokenized words in tree.
    """
    nested = [tokenize(block, None, False, True) for block in tree.leaves()]
    leaves = []
    for block in nested:
        for word in block:
            leaves.append(word)
    return leaves
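
# Hedged usage sketch (illustration only): flatten a hand-built chunk tree.
# Assumes the project's tokenize() helper is visible in this scope; in the
# real pipeline the trees come from the chunker rather than Tree.fromstring().
def _demo_tokenize_leaves():
    from nltk.tree import Tree
    tree = Tree.fromstring("(NP (JJ severe) (NN nausea) (CC and) (NN headaches))")
    # roughly ['severe', 'nausea', 'headache'], depending on the stopword and
    # lemmatizing rules inside tokenize()
    return tokenize_leaves(tree)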