def make_lookup():
    tags = []
    for doc in tagColl.find():
        for c in doc["concepts"]:
            tags.append(c["text"])
    tags = [t.lower() for t in list(set(tags))]
    lookup = {}
    for t in tags:
        lookup[t.lower()] = []
        lookup[stem(t.lower(), stemmer=PORTER)] = []
    for doc in tagColl.find():
        for c in doc["concepts"]:
            lookup[c["text"].lower()].append([str(doc["_id"]), c["relevance"]])
            lookup[stem(c["text"].lower(), stemmer=PORTER)].append([str(doc["_id"]), c["relevance"]])
    for k in lookup.keys():
        lookup[k] = sorted(lookup[k], key=lambda x: x[1])
        lookup[k] = [x[0] for x in lookup[k]]
    with open('hst_lookup.json', 'w') as outfile:
        json.dump(lookup, outfile)
    return lookup

def roots_and_lemmas():
    print(stem('cars', PORTER))     # Root
    print(stem('cars', LEMMA))
    print(stem('studies', PORTER))  # Root
    print(stem('studies', LEMMA))
    text = "People who teach find teaching very rewarding."
    tokens = words(text)
    print(count(tokens, stopwords=True, stemmer=PORTER))
    print(count(tokens, stopwords=True, stemmer=LEMMA))

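# A minimal companion sketch (not from the original sources) contrasting the two
# stemmers used above; the outputs in the comments are illustrative expectations.
from pattern.vector import stem, PORTER, LEMMA

print(stem('studies', stemmer=PORTER))  # e.g. 'studi' -- a root, not necessarily a real word
print(stem('studies', stemmer=LEMMA))   # e.g. 'study' -- the dictionary form
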
def features(message):
    singlegrams = [i for i in message.split() if i not in stop]  # Removing stopwords
    singlegramsrefined = []
    # Stemming the single words
    for k in singlegrams:
        r = stem(k, stemmer=LEMMA)
        singlegramsrefined.append(r)
    newmessage = " ".join(singlegramsrefined)
    newmessage = re.sub("[^A-Za-z]", " ", newmessage)  # Removing numbers
    newmessage = re.sub(r'[^\w]', ' ', newmessage)     # Removing non-word characters
    singlegrams = [i for i in newmessage.split()]
    singlegramsrefined2 = []
    for word in singlegrams:
        singlegramsrefined2.append(word)
    bigrams = ngrams(newmessage, n=2)   # bigrams
    trigrams = ngrams(newmessage, n=3)  # trigrams
    totalgrams = singlegramsrefined2 + bigrams + trigrams
    totalgrams = tuple(totalgrams)  # tuple holding single words, bigrams and trigrams
    return totalgrams

def features(message):
    # List of nltk stopwords
    stop = [u'i', 'diabetes', 'diabetic', 'type 2 diabetes', 'type 2', u'me', u'my', u'myself', u'we', u'our',
            u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his',
            u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their',
            u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those',
            u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having',
            u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because',
            u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between',
            u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up',
            u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once',
            u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few',
            u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not', u'only', u'own', u'same',
            u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', u'should', u'now', 'm']
    singlegrams = [i for i in message.split() if i not in stop]  # Removing stopwords
    singlegramsrefined = []
    # Stemming the single words
    for k in singlegrams:
        r = stem(k, stemmer=LEMMA)
        if r not in stop:
            singlegramsrefined.append(r)
    newmessage = " ".join(singlegramsrefined)
    newmessage = re.sub("[^A-Za-z]", " ", newmessage)  # Removing numbers
    newmessage = re.sub(r'[^\w]', ' ', newmessage)     # Removing non alphanumerics
    singlegrams = [i for i in newmessage.split() if len(i) > 1]
    singlegramsrefined2 = []
    for word in singlegrams:
        singlegramsrefined2.append(word)
    bigrams = ngrams(newmessage, n=2)   # bigrams
    trigrams = ngrams(newmessage, n=3)  # trigrams
    v = parsetree(newmessage, lemmata=True)[0]
    v = [w.lemma for w in v if w.tag.startswith(('NN'))]
    singlewords = []
    for i in v:
        stopping = stop + [u'hour', u'husband', u'anything', u'thing', u'way', u'n', u'number', u'person',
                           u'd', u'x', u'dose', u'drug', u'today', u'help', u'everyone', u'bed', u'mine',
                           u'bed', u'issue', u'anyone', u'thank', u'test', u'eat', u'something', u'doc',
                           u'time', u'c', u'luck', u'lb', u'dr', u'morning', 't', u'pill', u'upset', u'take',
                           u'couple', u'month', u'use', u'exercise', u'diet', u'lot', u'vision', 'taking',
                           u've', u'time', u'month', u'level', u'body', u'diet', u'food', u'release', u'time',
                           u'meal', u'glipizide', u'week', 'type', 'yr', u'symptom', u'cause', u'tablet',
                           u'blood', u'feel', u'like', u'made', u'bad', u'work', u'still', u'got', u'twice',
                           u'i', u'mg', u'm', u'day', u'sugar', u'taking', u'doctor', u'get', u'year',
                           u'side', u'went', u'med', u'one', u'better', u'effect', u'problyear', u'side',
                           u'went', u'med', u'one', u'better', u'effect', u'problem', u'also']
        if i not in stopping:
            singlewords.append(i)
    bi = []
    for r in bigrams:
        if r not in [(u'year', u'now'), (u'also', u'take'), (u'doesn', u't'), (u'take', u'food'),
                     (u'taking', u'metformin'), (u'i', u'diagnosed'), (u'metformin', u'mg'),
                     (u'empty', u'stomach'), (u'couldn', u't'), (u'blood', u'sugar'), (u'diet', u'exercise'),
                     (u'mg', u'x'), (u'type', u'diabetes'), (u'side', u'effect'), (u'i', u'm'), (u'i', u've'),
                     (u'twice', u'day'), (u'a', u'c'), (u'don', u't'), (u'slow', u'release'), (u't', u'take'),
                     (u't', u'take'), (u'good', u'luck'), (u'didn', u't'), (u'mg', u'twice'),
                     (u'take', u'metformin'), (u'time', u'day'), (u'went', u'away'), (u'year', u'ago'),
                     (u'much', u'better'), (u'extended', u'release'), (u'started', u'taking'), (u'can', u't'),
                     (u'anyone', u'else'), (u'month', u'ago'), (u'mg', u'day')]:
            bi.append(r)
    totalgrams = singlewords + bi
    return totalgrams

def stem_words(data):
    """ Stem words to their base linguistic stem to remove redundancy """
    # Rebuild the list: reassigning the loop variable would not modify `data` in place.
    return [stem(val, stemmer=PORTER) for val in data]

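# Usage sketch for the helper above (assumes pattern.vector is installed);
# the token list is invented for illustration, expected roots shown in the comment.
from pattern.vector import stem, PORTER

print(stem_words(["wolves", "spies", "studies"]))  # expected: ['wolv', 'spi', 'studi']
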
def __iter__(self):
    for line in open(os.path.join(__location__, 'KeyVisCorpora', 'abstracts.txt'), 'rU'):
        line = unicode(line, errors='ignore')
        lowers = line.lower()
        tokenList = lowers.split()
        output = [stem(word, stemmer=LEMMA) for word in tokenList]
        # Assume there's one document per line, tokens separated by space
        yield dictionary.doc2bow([x.strip() for x in output])

def stem_words(words):
    """Stem words to their base linguistic stem to remove redundancy.

    Args:
        words (list): The list of words

    Returns:
        list: An updated word list with words stemmed.
    """
    return [stem(word, stemmer=PORTER) for word in words]

def featureExtractor(textMessage, countgrams):
    textMessage = textMessage.lower()
    # Stop words to remove
    stopWords = [u'i', 'm', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your',
                 u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her',
                 u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs',
                 u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those',
                 u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had',
                 u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if',
                 u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about',
                 u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above',
                 u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under',
                 u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how',
                 u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such',
                 u'no', u'nor', u'not', u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's',
                 u't', u'can', u'will', u'just', u'don', u'should', u'now']
    avoidList1 = ['diabetes', 'type 2', 'diabetic']
    avoidList = stopWords + avoidList1
    # Removing these stop words and general cleaning
    singleGrams = [i for i in textMessage.split() if i not in avoidList]
    singlegramsRefined = []
    # Stemming the words for normalization
    for k in singleGrams:
        r = stem(k, stemmer=LEMMA)
        singlegramsRefined.append(r)
    newMessage = " ".join(singlegramsRefined)
    newMessage = re.sub("[^A-Za-z]", " ", newMessage)  # Removing numbers
    newMessage = re.sub(r'[^\w]', ' ', newMessage)     # Removing all non alphanumeric chars
    singleGrams = [i for i in newMessage.split()]      # Again splitting to single grams
    singlegramsRefined2 = [word for word in singleGrams]  # Keep this now because it works
    biGrams = ngrams(newMessage, n=2)   # Generating bigrams
    triGrams = ngrams(newMessage, n=3)  # Generating trigrams
    totalGramsrefined = []
    if countgrams == 1:
        totalGrams = singlegramsRefined2
        totalGramsrefined = [i for i in totalGrams]
    elif countgrams == 2:
        totalGrams = singlegramsRefined2 + biGrams
        totalGramsrefined = [i for i in totalGrams]
    elif countgrams == 3:
        totalGrams = singlegramsRefined2 + biGrams + triGrams
        totalGramsrefined = [i for i in totalGrams]
    return totalGramsrefined

def _validate(self, clue: str, positiveWords: np.array, negativeWords: np.array) -> bool:
    clue = clue.lower()
    invalidWords: np.array = np.append(
        self.previousClues, np.append(positiveWords, negativeWords))
    stemmedClue: str = stem(clue)
    singularClue: str = singularize(clue)
    pluralClue: str = pluralize(clue)
    if not clue.isalpha() or not clue.isascii() or set(
            "aeiouy").isdisjoint(clue) or not 2 <= len(clue) <= 12:
        return False
    for word in invalidWords:
        stemmedWord = stem(word)
        singularWord = singularize(word)
        pluralWord = pluralize(word)
        if clue in word or word in clue or stemmedClue in word or stemmedWord in clue or \
                singularClue in word or singularWord in clue or pluralClue in word or pluralWord in clue:
            return False
    return True

def visit(self, link, source=None):
    print('visited:', repr(link.url), 'from:', link.referrer)
    i = str(link).split('/')
    i = [stem(i[j]) for j in range(len(i))]
    i = '_'.join(str(e) for e in i)
    b = search('5g', i)
    Data = {}
    if not len(b) == 0:
        hash_object = hashlib.sha256(link.url)
        hex_dig = hash_object.hexdigest()
        Data['id'] = hex_dig
        Data['link'] = repr(link.url)
        #Data['source'] = FROM
        r = json.dumps(Data)
        loaded_r = json.loads(r)
        es.index(index='mining_links', doc_type='mining_links', id=i, body=loaded_r)
        Data = {}

def make_additional_keywords(self):
    '''unstemmed words not in stemmed list'''
    assert type(self.keywords_stemmed) == list
    if len(self.keywords_stemmed) > 0:
        assert type(self.keywords_stemmed[0]) == tuple
    assert type(self.keywords_unstemmed) == list
    if len(self.keywords_unstemmed) > 0:
        assert type(self.keywords_unstemmed[0]) == tuple
    ## make simple stemmed keyword list from (score, word) tuple
    temp_simple_stemmed = []
    for kw_tuple in self.keywords_stemmed:
        score = kw_tuple[0]; word = kw_tuple[1]
        temp_simple_stemmed.append(word)
    ## add any additional unstemmed keywords (whose stems aren't in temp_simple_stemmed)
    self.keywords_unstemmed_additional = []
    for kw_tuple in self.keywords_unstemmed:
        score = kw_tuple[0]; word = kw_tuple[1]
        if word not in temp_simple_stemmed:  # TODO: time using sets here instead
            if stem(word, stemmer=PORTER) not in temp_simple_stemmed:
                self.keywords_unstemmed_additional.append(kw_tuple)

def handle_starttag(self, tag, attrs):
    #print(lien)
    for attr in attrs:
        a = attr[1]
        a = a.split('/')
        a = [stem(a[j]) for j in range(len(a))]
        #print(a)
        a = '_'.join(str(e) for e in a)
        for i in attr:
            l = search('src', i)
            b = search('path', i)
            b1 = search('5g', a)
            b2 = search('mob', a)
            b3 = search('imag', a)
            b4 = search('video', a)
            b5 = search('pdf', a)
            if not len(b1) == 0:
                if not len(b5) == 0:
                    if not attr[1][2:] in Pdfs[lien]:
                        Pdfs[lien].append(attr[1][2:])
                if not len(b3) == 0:
                    if not attr[1][2:] in Images[lien]:
                        Images[lien].append(attr[1][2:])
                if not len(b4) == 0:
                    if not attr[1][2:] in Videos[lien]:
                        Videos[lien].append(attr[1][2:])
            if not len(l) == 0 and not len(b1) == 0:
                if not attr[1][2:] in Images[lien]:
                    Images[lien].append(attr[1][2:])

def process_articles(articles, stoplist):
    print "Cleaning Articles: Special Characters, Stemming, Stopwords"
    remove_list = string.ascii_letters + string.digits
    cleanArticles = []
    for a in articles:
        # html entities
        a = gensim.utils.decode_htmlentities(a)
        # Remove Unicode
        temp = a.decode("utf-8")
        temp = temp.encode("ascii", errors="ignore")
        # Split
        temp = temp.split()
        cleanArticle = []
        for w in temp:
            # Lowercase
            w = w.lower()
            if w in stoplist:
                continue
            # Remove Special Chars
            w = "".join([l for l in w if l in remove_list])
            if w != "":
                w = stem(w, stemmer=LEMMA)
                cleanArticle.append(w)
        cleanArticles.append(cleanArticle)
    print "Cleaned Articles"
    return cleanArticles

def test_stem(self):
    # Assert stem with PORTER, LEMMA and pattern.en.Word.
    s = "WOLVES"
    v1 = vector.stem(s, stemmer=None)
    v2 = vector.stem(s, stemmer=vector.PORTER)
    v3 = vector.stem(s, stemmer=vector.LEMMA)
    v4 = vector.stem(s, stemmer=lambda w: "wolf*")
    v5 = vector.stem(Word(None, s, lemma=u"wolf*"), stemmer=vector.LEMMA)
    v6 = vector.stem(Word(None, s, type="NNS"), stemmer=vector.LEMMA)
    self.assertEqual(v1, "wolves")
    self.assertEqual(v2, "wolv")
    self.assertEqual(v3, "wolf")
    self.assertEqual(v4, "wolf*")
    self.assertEqual(v5, "wolf*")
    self.assertEqual(v6, "wolf")
    # Assert unicode output.
    self.assertTrue(isinstance(v1, unicode))
    self.assertTrue(isinstance(v2, unicode))
    self.assertTrue(isinstance(v3, unicode))
    self.assertTrue(isinstance(v4, unicode))
    self.assertTrue(isinstance(v5, unicode))
    self.assertTrue(isinstance(v6, unicode))
    print("pattern.vector.stem()")

def parse_message(self, text, usernick, channel):
    if channel != self.chan:
        userOrFalse = usernick
    else:
        userOrFalse = False
    words = re.findall(r"\b[\w]+\b", text.lower())
    tokens = text.lower().split()
    original_words = words[:]
    try:
        words.remove(self.nick)
    except:
        pass
    try:
        words.remove('hst')
    except:
        pass
    try:
        tree = parsetree(' '.join(words))
        firstNoun = match('NN|NNS|NNP|NNPS', tree)
    except:
        firstNoun = None
    # print original_words
    if self.nick in original_words:
        if set(words) & set(['help', 'commands']):
            commandsTemp = Template(self.commands)
            self.send_msg(commandsTemp.substitute(usernick=usernick, botnick=self.nick),
                          channel=userOrFalse)
        elif '?' in text or (set(words) & set(['who', 'where', 'when', 'what', 'why', 'how'])):
            fileObj = open('weird_grammar.json', 'r')
            jsonObj = json.load(fileObj)
            fileObj.close()
            s = sentiment(text)[0]
            if s > 0:
                print s * 2500 + 1
                self.send_msg(make_polar(jsonObj, int(s * 2500 + 1)), channel=userOrFalse)
            else:
                print s * 2500 - 1
                self.send_msg(make_polar(jsonObj, int(s * -2500 - 1), sent=0), channel=userOrFalse)
        elif firstNoun is not None:
            print firstNoun.string.replace('_', ' ')
            s = sentiment(text)[0]
            sentences = sorted(
                pf_sentences(abs(s * 1000 + 3), firstNoun.string.replace('_', ' ')),
                key=lambda x: sentiment(x)[0]
            )
            if s > 0:
                # print s * 2500 + 1
                self.send_msg(' '.join(sentences[-3:]), channel=userOrFalse)
            else:
                # print s * 2500 - 1
                self.send_msg(' '.join(sentences[:3]), channel=userOrFalse)
        else:
            snarkTemp = Template(rc(self.snarklist))
            self.send_msg(snarkTemp.substitute(usernick=usernick, botnick=self.nick),
                          channel=userOrFalse)
    if tokens[0] == '.seen':
        tgt_user = tokens[1]
        if tgt_user in self.seen_dict:
            last_time, last_msg = self.seen_dict[tgt_user]
            self.send_msg("%s: %s last seen on %s saying: %s" % (usernick, tgt_user, last_time, last_msg),
                          channel=userOrFalse)
        else:
            self.send_msg("%s: I haven't seen %s." % (usernick, tgt_user), channel=userOrFalse)
    elif tokens[0] == '.tell':
        tgt_user = tokens[1]
        if not tgt_user in self.tells_dict:
            self.tells_dict[tgt_user] = []
        self.tells_dict[tgt_user].append((usernick, ' '.join(tokens[2:])))
        self.send_msg("%s: Ok, I'll tell %s that for you." % (usernick, tgt_user), channel=userOrFalse)
        with open('tells_dict.json', 'w') as outfile:
            json.dump(self.tells_dict, outfile)
    elif tokens[0] == '.showtells':
        if not usernick in self.tells_dict or not self.tells_dict[usernick]:
            self.send_msg("%s: I have nothing for you." % usernick, channel=usernick)
        else:
            while self.tells_dict[usernick]:
                src_user, tell_msg = self.tells_dict[usernick].pop()
                self.send_msg("%s said: %s" % (src_user, tell_msg), channel=usernick)
            with open('tells_dict.json', 'w') as outfile:
                json.dump(self.tells_dict, outfile)
    elif tokens[0] == '.gif':
        gif_url = get_gif(tokens[1:])
        self.send_msg("%s: %s" % (usernick, gif_url), channel=userOrFalse)
    elif tokens[0] == '.wiki':
        try:
            wiki_url, wiki_text = get_wiki_article(tokens[1:])
        except:
            self.send_msg("%s: I'm sorry, but something went wrong!" % usernick, channel=userOrFalse)
        else:
            if wiki_text:
                safe_wiki_text = ''.join(list(wiki_text)[:300]).replace('\n', ' ') + '...'
                safe_wiki_text = safe_wiki_text.encode('ascii', 'ignore')
                self.send_msg("%s: %s | %s" % (usernick, wiki_url, safe_wiki_text), channel=userOrFalse)
            else:
                self.send_msg("%s: I'm sorry, but something went wrong!" % usernick, channel=userOrFalse)
    elif tokens[0] == '.yt':
        try:
            result = youtube_search(tokens[1:])
            result = map(lambda x: x.encode('ascii', 'ignore'), result)
            title, desc, vidId = result
            self.send_msg("%s: %s | %s | https://www.youtube.com/watch?v=%s" % (usernick, title, desc, vidId),
                          channel=userOrFalse)
        except:
            self.send_msg("%s: I'm sorry, but something went wrong!" % usernick, channel=userOrFalse)
    elif tokens[0] == '.hst':
        # self.send_msg('/nick drgonzo')
        if firstNoun is not None:
            lookupFile = open("hst_lookup.json", 'r')
            lookup = json.load(lookupFile)
            lookupFile.close()
            nounStem = stem(firstNoun, stemmer=PORTER)
            idHash = None
            print nounStem
            try:
                # switch to descending
                idHash = rc(lookup[nounStem])
                print idHash
            except KeyError:
                pass
            try:
                idHash = rc(lookup[firstNoun])
                print idHash
            except KeyError:
                pass
            if idHash is not None:
                bookFile = open("hst_text.json", 'r')
                books = json.load(bookFile)
                bookFile.close()
                text = books[idHash].encode('ascii', 'ignore')
                # print text
                self.send_msg("%s: %s" % (usernick, text), channel=userOrFalse)
            else:
                self.send_msg("%s: Can't say I know it." % usernick, channel=userOrFalse)
        else:
            self.send_msg("%s: Nothing to say about that." % usernick, channel=userOrFalse)
        # self.send_msg('/nick itpbot')
    if "ross" in words:
        self.send_msg("%s: I hope you're not speaking ill of my creator." % usernick, channel=userOrFalse)
    if "itp" in words:
        message = rand_itp_acronym()
        self.send_msg(message, channel=userOrFalse)

def stem(self, word):
    return stem(word, stemmer=PORTER)

def token_hypernyms(token, recursive, depth):
    '''Stem each token using default stemmer from the pattern library (PORTER?)'''
    for synset in wordnet.synsets(stem(token)):
        for hypernym in synset.hypernyms(recursive, depth):
            for sense in hypernym.senses:
                yield sense

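# Hypothetical invocation of the generator above (pattern.en's wordnet module and
# pattern.vector.stem are assumed to be imported, as in the snippet); "dogs"
# Porter-stems to "dog", a word WordNet knows, so this should yield hypernym senses
# up to two levels above it.
for sense in token_hypernyms("dogs", True, 2):
    print(sense)
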
def featureExtractor(textMessage, countgrams):
    textMessage = textMessage.lower()
    # Stop words to remove (nltk-style list)
    stopWords = [u'i', 'm', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your',
                 u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her',
                 u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs',
                 u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those',
                 u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had',
                 u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if',
                 u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about',
                 u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above',
                 u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under',
                 u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how',
                 u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such',
                 u'no', u'nor', u'not', u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's',
                 u't', u'can', u'will', u'just', u'don', u'should', u'now']
    avoidList1 = ["actos", "pioglitazone hydrochloride", "pioglitazone", "glustin", "glizone", "pioz", "zactos"]
    avoidList2 = ["medformin", "metfornin", "metforin", "glucophage", "metformin", "glucophage xr",
                  "metformin hydrochloride", "carbophage sr", "riomet", "fortamet", "glumetza", "obimet",
                  "gluformin", "dianben", "diabex", "diaformin", "siofor", "metfogamma", "riomet", "diformin",
                  "metformi", "metphormin", "metaforming", "metfirman", "metoformin", "metfomin"]
    avoidList3 = ["byetta", "bydureon", "exenatide", "byetta"]
    avoidList4 = ["victosa", "victoza", "liraglutide", "saxenda", "victoza"]
    avoidList5 = ["invokana", "invokana", "canagliflozin"]
    avoidList6 = ["avandia", "rosiglitazone"]
    avoidList7 = ["insu", "humalog", "levimir", "novolog", "insuline", "insulin glargine", "insulins",
                  "lantus", "toujeo", "abasaglar", "basaglar", "insulin", "insulins", "levamir", "levemir"]
    avoidList8 = ["sitagliptin", "janumet", "januvia", "juvisync", "junuvia", "januvia", "sitaglipton"]
    avoidList9 = ["amaryl", "glimepiride", "gleam", "k-glim-1", "glucoryl", "glimpid", "glimy", "ameryl"]
    avoidList10 = ['diabetes', 'type 2', 'diabetic']
    avoidList = (stopWords + avoidList1 + avoidList2 + avoidList3 + avoidList4 + avoidList5 +
                 avoidList6 + avoidList7 + avoidList8 + avoidList9 + avoidList10)
    # Removing these stop words and general cleaning
    singleGrams = [i for i in textMessage.split() if i not in avoidList]
    singlegramsRefined = []
    # Stemming the words for normalization
    for k in singleGrams:
        r = stem(k, stemmer=LEMMA)
        singlegramsRefined.append(r)
    newMessage = " ".join(singlegramsRefined)
    newMessage = re.sub("[^A-Za-z]", " ", newMessage)  # Removing numbers
    newMessage = re.sub(r'[^\w]', ' ', newMessage)     # Removing all non alphanumeric chars
    singleGrams = [i for i in newMessage.split()]      # Again splitting to single grams
    singlegramsRefined2 = [word for word in singleGrams]  # Keep this now because it works
    biGrams = ngrams(newMessage, n=2)   # Generating bigrams
    triGrams = ngrams(newMessage, n=3)  # Generating trigrams
    listModelfeatures = modelFeatures()
    totalGramsrefined = []
    if countgrams == 1:
        totalGrams = singlegramsRefined2
        # Keep only those features of the text that are in the model
        totalGramsrefined = [i for i in totalGrams if i in listModelfeatures]
    elif countgrams == 2:
        totalGrams = singlegramsRefined2 + biGrams
        totalGramsrefined = [i for i in totalGrams if i in listModelfeatures]
    elif countgrams == 3:
        totalGrams = singlegramsRefined2 + biGrams + triGrams
        totalGramsrefined = [i for i in totalGrams if i in listModelfeatures]
    return totalGramsrefined

    filter=lambda w: w.strip("'").isalnum(),
    punctuation='.,;:!?()[]{}`' '\"@#$^&*+-|=~_')  # returns a list of words by splitting the string on spaces.

freq_dic = count(          # takes a list of words and returns a dictionary of (word, count)-items.
    words=words_list,
    top=None,              # Filter words not in the top most frequent (int).
    threshold=0,           # Filter words whose count <= threshold.
    stemmer=None,          # PORTER | LEMMA | function | None
    exclude=[],            # Filter words in the exclude list.
    stopwords=False,       # Include stop words?
    language='en')         # en, es, de, fr, it, nl

for k, v in freq_dic.iteritems():
    print k, v

# stop words and stemming
print stem('spies', stemmer=PORTER)
print stem('spies', stemmer=LEMMA)

s = 'The black cat was spying on the white cat.'
print count(words(s), stemmer=PORTER)
print count(words(s), stemmer=LEMMA)

s = 'The black cat was spying on the white cat.'
s = Sentence(parse(s))
print count(s, stemmer=LEMMA)

# character n-grams
print chngrams('The cat sat on the mat.'.lower(), n=3)

# document
text = "The shuttle Discovery, already delayed three times by technical problems and bad weather, was grounded again" \
       "Friday, this time by a potentially dangerous gaseous hydrogen leak in a vent line attached to the shipʼs" \
       "external tank. The Discovery was initially scheduled to make its 39th and final flight last Monday, bearing" \
       "fresh supplies and an intelligent robot for the International Space Station. But complications delayed the" \
       "flight from Monday to Friday, when the hydrogen leak led NASA to conclude that the shuttle would not be ready" \

def stemmer(self, word):
    # stemmer=None, stemmer=LEMMA, stemmer=PORTER
    print stem(word, stemmer=PORTER)

def extract_overlap(blah, lookup):
    # Parse input text...
    s = parsetree(blah, relations=True, lemmata=True)
    keywords = []
    raw = []
    adj = []
    all_tokens = []
    for sentence in s:
        for chunk in sentence.subjects + sentence.objects:
            h = chunk.head
            if h.type != "PRP":
                keywords.append(h.string)
                raw.append(chunk.string)
        for word in sentence.words:
            if word.type == "JJ":
                adj.append(word.string)
            all_tokens.append(word.string)
    # Make candidate lists...
    key_cand = []
    syn_key_cand = []
    stem_key_cand = []
    adj_cand = []
    syn_adj_cand = []
    stem_adj_cand = []
    all_cand = []
    syn_all_cand = []
    stem_all_cand = []
    last_resort = []
    tags = [k.lower() for k in lookup.keys()]
    for word in keywords:
        if word.lower() in tags:
            key_cand.append(word.lower())
        for synword in synonymous(word):
            if synword.lower() in tags:
                syn_key_cand.append(synword.lower())
        if stem(word.lower(), stemmer=PORTER) in tags:
            stem_key_cand.append(stem(word.lower(), stemmer=PORTER))
    for word in adj:
        if word.lower() in tags:
            adj_cand.append(word.lower())
        for synword in synonymous(word):
            if synword.lower() in tags:
                syn_adj_cand.append(synword.lower())
        if stem(word.lower(), stemmer=PORTER) in tags:
            stem_adj_cand.append(stem(word.lower(), stemmer=PORTER))
    for word in all_tokens:
        if word.lower() in tags:
            all_cand.append(word.lower())
        for synword in synonymous(word):
            if synword.lower() in tags:
                syn_all_cand.append(synword.lower())
        if stem(word.lower(), stemmer=PORTER) in tags:
            stem_all_cand.append(stem(word.lower(), stemmer=PORTER))
    for k in tags:
        if k.lower() in blah.lower():
            last_resort.append(k)
    cand = (key_cand + stem_key_cand + adj_cand + stem_adj_cand + all_cand +
            stem_all_cand + syn_key_cand + syn_adj_cand + syn_all_cand + last_resort)
    if cand == None:
        cand = []
    return cand

def lemmatize(self, word):
    return stem(word, stemmer=LEMMA)

"""(1) Read from abstracts.txt populated by corporaReader.py""" print "Reading abstracts.txt ..." abstractList = [] with open(os.path.join(__location__, 'KeyVisCorpora', 'abstracts.txt'), 'rU') as inputFile: document = inputFile.readlines() for abstract in document: abstractList.append(abstract) print "Finished reading %i abstracts.txt!" % len(abstractList) """(2) Create token list from abstractList; unicode encoding""" print "Creating token list ..." abstractTokens = [[unicode(word, "utf-8", errors = "ignore") for word in line.split()] for line in abstractList] abstractTokens = [[stem(word, stemmer=LEMMA) for word in line] for line in abstractTokens] """Build dictionary and do dictionary pre-processing""" print "Building dicitonary ..." dictionary = corpora.Dictionary(abstractTokens) #remove stop words and words that appear only once stopwords = stopwords.words('english') exclusionlist = ['-', 'se', 'h', 'd', 'iee'] #manually populated; add to this if necessary stopwords = stopwords + exclusionlist stop_ids = [dictionary.token2id[stopword] for stopword in stopwords if stopword in dictionary.token2id] once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq ==1] dictionary.filter_tokens(stop_ids) #remove them from the dictionary "dictionary.filter_tokens(stop_ids + once_ids)" dictionary.filter_tokens(once_ids) #remove terms that only occur once dictionary.compactify() # remove gaps in id sequence after words that were removed dictionary.save(os.path.join(__location__, 'data/KeyVis.dict')) #store dictionary for future reference