def filter_string(self, file):
    '''Takes an opened file as input. For each line in the file, it removes
    special characters, stop words and spaces, and appends the leftover
    characters into one string.'''
    stop_word = stop_words()  # a function that returns all the frequently used words
    list1 = []
    finger_string = ""
    for line in file:
        list1.extend(line.split(" "))
    if len(list1) != 0:
        for word in list1:
            if word not in stop_word:
                word = word.strip("\n")
                word_list2 = []
                for i in word:
                    # keep ASCII letters, digits and underscores only
                    if (65 <= ord(i) <= 90 or 97 <= ord(i) <= 122
                            or 48 <= ord(i) <= 57 or i == "_"):
                        word_list2.append(i)
                word1 = ''.join(word_list2)
                finger_string += word1.lower()
        return finger_string
    else:
        return None
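# Hedged usage sketch, not part of the original module: it assumes the
# stop_words() helper used above is importable here and returns common English
# words. `self` is unused by filter_string, so None stands in for the instance,
# and a plain list of lines stands in for an opened file.
_sample_lines = ["The quick brown fox is fast!\n"]
print(filter_string(None, _sample_lines))
# -> one lowercase string such as "thequickbrownfoxfast"
#    (the exact result depends on the project's stop list)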
def rm_stop_words(self, tokens):
    """Return the tokens that are not in the project's stop-word list."""
    s = stop_words.stop_words()
    sw = s.stop_words
    result_tokens = []
    for token_one in tokens:
        if token_one not in sw:
            result_tokens.append(token_one)
    return result_tokens
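# Hedged usage sketch, not part of the original module: it assumes the
# stop_words module imported by this project exposes a stop_words class whose
# `stop_words` attribute is a collection of common words, as the method above
# expects. The sample tokens are invented, and None stands in for the unused `self`.
_sample_tokens = ["the", "quick", "brown", "fox"]
print(rm_stop_words(None, _sample_tokens))
# -> ["quick", "brown", "fox"] with a conventional English stop list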
def vocab_filter(url, pubs, min_shared_vocab_size=2, vocab_use_pct=1.0):
    """
    Determine which publications are valid based on the vocabulary provided by
    the URL (which is assumed to represent the field and publication topics of
    the person named).
    """
    # read the URL
    #print 'Obtaining URL data...'
    url_words = None
    if url.endswith('.pdf'):
        url_words = __read_pdf_text(url)
    else:
        fh = urlopen(url)
        url_content = '\n'.join(fh.readlines())
        url_words = set(map(lambda x: x.lower(), re.findall('[A-Za-z-]+', url_content)))

    swords = stop_words()
    url_words.difference_update(swords)

    # if we're only supposed to use some of the URL words, then subsample as appropriate.
    num_words_to_remove = int(math.ceil(float(len(url_words)) * (1.0 - vocab_use_pct)))
    for i in range(num_words_to_remove):
        url_words.pop()

    ######
    # Filter the publications
    #print 'Filtering publications...'

    # filter based on shared vocabulary
    accepted_pubs = []
    for pub in pubs:
        # check word content
        pub_words = set(map(lambda x: x.lower(), re.findall('[A-Za-z-]+', pub.title)))
        #.union(set(map(lambda x: x.lower(), re.findall('\w+', pub.source))))
        pub_words.difference_update(swords)

        shared_words = url_words.intersection(pub_words)
        if len(shared_words) < min_shared_vocab_size:
            continue
        #print pub.title, shared_words

        # if we got here, then the publication is ok!
        accepted_pubs.append(pub)

    return accepted_pubs
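# Hedged usage sketch, not part of the original module: `_FakePub` and the URL
# are hypothetical stand-ins; vocab_filter() only reads the `title` attribute of
# each publication, so a minimal object is enough for illustration.
class _FakePub(object):
    def __init__(self, title):
        self.title = title

_candidates = [_FakePub('Vocabulary based filtering for author disambiguation'),
               _FakePub('An unrelated paper about cooking')]
# keep only publications sharing at least 3 non-stop-word terms with the page text
_kept = vocab_filter('http://example.org/~someone/index.html', _candidates,
                     min_shared_vocab_size=3)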
def tokenize(string):
    # Tokenize the string
    tokens = string.split(" ")

    # Remove tokens with word length < 3
    new_tokens = []
    for token in tokens:
        if len(token) > 2:
            new_tokens.append(token)

    # Remove stopwords
    sw = stop_words()
    word_tokens = new_tokens
    filtered_tokens = []
    for w in word_tokens:
        if w not in sw:
            filtered_tokens.append(w)
    return filtered_tokens
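# Hedged usage sketch, not part of the original module: it assumes stop_words()
# returns a collection of common English words. Tokens shorter than three
# characters (e.g. "a") are dropped before the stop-word check even runs.
print(tokenize("the quick brown fox jumps over a lazy dog"))
# -> roughly ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog'], give or take a
#    word depending on the project's stop list (e.g. whether it includes 'over')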
def build_chunks(drug, classifier, limit=None):
    """Pulls comment data from SQL table, constructs trees for each, chunks by
    drug mention, writes to Chunks SQL table organized by drug.

    ARGS:
    drug: string. drug name.
    classifier: nltk.classify.NaiveBayesClassifier object. trained Naive Bayes
        classifier.

    KWARGS:
    limit: int or None. optional cap on number of comments streamed through
        processor.

    RAISES:
    ValueError: if invalid drug is input.
    """
    try:
        drug = _drug_dict[drug.upper()]
    except KeyError:
        raise ValueError("invalid drug")

    def uniconvert(s):
        # boolean columns come back as raw bytes; map them to 0/1
        if s == '\x00':
            return 0
        elif s == '\x01':
            return 1
        else:
            return None

    conn = pms.connect(host='localhost', user='******', passwd='', db='empath',
                       charset='utf8', init_command='SET NAMES UTF8')
    cur = conn.cursor()

    # assemble the mother of all queries
    query = "SELECT c.id,c.body,m.count"
    for gen in _generics:
        query += (",m.%s" % gen.lower())
    query += " FROM Comments c JOIN Subreddits s on c.subreddit=s.subreddit "
    query += "JOIN Mentions m on c.id=m.id WHERE (m.count=1 OR m.count=2) "
    query += ("AND m.%s=True AND c.chunked=False" % drug.lower())
    if limit is not None:
        query += (" LIMIT %s" % limit)
    cur.execute(query)
    # the default buffered cursor has already fetched every row, so the
    # connection can be closed before iterating the results
    conn.close()

    for row in cur:
        post_id = row[0]
        body = row[1]
        count = row[2]
        drugs = np.array([uniconvert(d) for d in row[3:]])
        dmap = np.where(drugs == 1)
        drugs = [d.lower() for d in list(np.array(_generics)[dmap])]

        # clean body text: replace brand names with their generic equivalents
        body = body.lower()
        for drug in drugs:
            for remap in _gen_dict.get(drug.upper(), [drug.upper()]):
                body = body.replace(remap.lower(), drug.lower())

        trees, sentiments = build_tree(body, drugs)
        subtexts, mentions, precedence = map_subtrees(trees, drugs)

        for i, drug in enumerate(OrderedSet(precedence)):
            drugtext = []
            for subtext in subtexts[drug]:
                for word in subtext:
                    drugtext.append(word)
            drugtext = [word for word in drugtext if word not in set(stop_words())]

            sents = []
            for j, men in enumerate(mentions):
                if len(men) == 0:
                    men = ['preamble']
                if drug in men:
                    sents.append(sentiments[j])

            # probability positive
            nbsent = classifier.prob_classify(
                dict([(word, True) for word in drugtext])).prob('pos')

            data = (post_id, i, drug, drugtext, sents, nbsent)
            yield data
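# Hedged usage sketch, not part of the original module: running it requires the
# project's local `empath` MySQL database and a Naive Bayes classifier already
# trained on {'word': True} feature dicts with 'pos'/'neg' labels (neither is
# set up here). 'PROZAC' is only an example key assumed to exist in _drug_dict.
def print_chunk_sentiment(classifier, drug='PROZAC', limit=100):
    """Iterate the build_chunks generator and print one line per drug chunk."""
    for post_id, chunk_idx, drug_name, words, sents, nb_prob in build_chunks(
            drug, classifier, limit=limit):
        print("%s chunk %d (%s): P(pos)=%.3f" % (post_id, chunk_idx, drug_name, nb_prob))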
from authorstats import compute_individual_stats, obtain_individual_pubs
import csv
from oryx.env import rb
import math
from pubfilter import *
import stop_words
import pylab as pl
from pubmodel import Publication
import pubstats

swords = stop_words.stop_words()

YEAR_FILTER = 'YEAR'
VOCAB_FILTER = 'VOCAB'
NAME_FILTER = 'NAME'
CONFLICT_FILTER = 'CONFLICT'
DUPLICATE_FILTER = 'DUPLICATE'

BEST_FILTERS = [YEAR_FILTER, VOCAB_FILTER, NAME_FILTER, DUPLICATE_FILTER]


def compute_test_case_stats(name, url, dfile, filters=BEST_FILTERS,
                            min_vocab_match_size=2, vocab_use_pct=1.0):
    """
    Results: # of total pubs found, # of found pubs, # true pubs,
    TP: # of matching pubs, FP, FN: # of unaccepted matching pubs
    """
    use_initials = False
    if name.startswith('^'):
        use_initials = True
        name = name[1:]

    # load the true pubs
def tokenize(text, drug=None, pos_filter=False, lemma=True):
    """Simple (or not) tokenizer for given text block.

    ARGS:
    text: string. Single comment block.

    KWARGS:
    drug: string or None. drug name (added to stoplist to prevent self-mentions)
    pos_filter: boolean. set True to use part-of-speech filtering.
    lemma: boolean. set True to use lemmatization.

    RETURNS:
    words: list. List of lower-case word tokens (individual strings)
    """
    tokens = nltk.RegexpTokenizer(r'\w+').tokenize(text.lower())
    merger = nltk.MWETokenizer([('side', 'effect'), ('side', 'effects')])
    tokens = merger.tokenize(tokens)

    # filter on stop words
    stops = sw.stop_words()
    if drug is not None:
        if drug.upper() != 'ANTIDEPRESSANT':
            stops.append(drug.lower())
        if _drug_dict[drug.upper()] != drug.upper():
            stops.append(_drug_dict[drug.upper()].lower())
        if drug.upper() in _gen_dict.keys():
            for bd in _gen_dict[drug.upper()]:
                stops.append(bd.lower())
    else:
        stops = stops + ['antidepressant', 'antidepressants']
    stops = set(stops)
    tokens = [word for word in tokens if word not in stops]

    if pos_filter:
        tagged_tokens = nltk.pos_tag(tokens)
        tags = ['CD', 'DT', 'JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNPS', 'NNS',
                'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
        tokens = [word for (word, tag) in tagged_tokens if tag in tags]

    if lemma:
        tokens = [_lemmatizer.lemmatize(word, pos='v') for word in tokens]
        tokens = [_lemmatizer.lemmatize(word, pos='n') for word in tokens]

    # one more pass through stopword filter
    tokens = [word for word in tokens if word not in stops]
    return tokens
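# Hedged usage sketch, not part of the original module: the comment text is
# invented, and 'prozac' is assumed to be a key (after upper-casing) of the
# module's _drug_dict/_gen_dict tables that tokenize() consults.
_example = "I started taking Prozac and the side effects were rough at first."
print(tokenize(_example, drug='prozac', pos_filter=False, lemma=True))
# expect the drug name (and its aliases) to be stripped out, 'side effects' to
# appear as the merged token 'side_effects', and 'started' to be lemmatized to 'start'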