def __init__(self, id, context):
    self.id = id
    self.words = []
    self.context = [context]
    self.stemmer = stemmer.PorterStemmer()
    self.feature = []
    self.items = {}
def indexText(text):
    # prepare text: split into sentences, strip whitespace, rejoin
    lines = text.split('.')
    clean_lines = [line.strip() for line in lines if line.strip()]
    newtext = '\n'.join(clean_lines)
    words = textmining.simple_tokenize(newtext)
    p = stemmer.PorterStemmer()
    # filter stop words (read into a separate name so the text
    # parameter is not clobbered)
    stoptext = open('stopwords.txt').read()
    stopwords = textmining.simple_tokenize(stoptext)
    # stem each token and count stem frequencies for non-stopwords
    freq = {}
    occur = {}
    for w in words:
        stem = p.stem(w, 0, len(w) - 1)
        if stem not in stopwords:
            freq[stem] = freq.get(stem, 0) + 1
            occur[stem] = w
    sorted_freq = sorted(freq.iteritems(), key=operator.itemgetter(1), reverse=True)
    # Concordance: print the surface form of the single most frequent stem
    most_freq_words = sorted_freq[:1]
    print "------Index-----"
    print occur[most_freq_words.pop()[0]]
    print "----------------"
    return occur
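A minimal call sketch for indexText, assuming the textmining module is importable and a stopwords.txt file is readable as above; the sample text is invented for illustration:

sample = 'The cat sat. The cat purred. A dog barked.'
occur = indexText(sample)  # prints the surface form of the most frequent stem, e.g. 'cat'
print occur                # stem -> original word mapping, e.g. {'cat': 'cat', ...}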
def freq_analysis(conn, cursor, cuisine, ingredients_list):
    """Performs analysis on a cuisine's list of 100 recipes.

    Finds term frequencies. Each ingredient is stemmed and checked
    against a stopwords list. Term frequencies are stored in a SQLite
    db with their associated cuisine type.
    """
    stopwords = [word[:-1] for word in open('stopwords.txt', 'r')]
    pstemmer = stemmer.PorterStemmer()
    freq = defaultdict(lambda: 1)
    # used to map word stems back to whole words
    mapping = defaultdict(list)
    for ingredients in ingredients_list:
        for ingredient in ingredients.split():
            ingredient = remove_punc(ingredient)
            if ingredient not in stopwords:
                ingredient_stem = pstemmer.stem(ingredient, 0, len(ingredient) - 1)
                freq[ingredient_stem] += 1
                mapping[ingredient_stem].append(ingredient)
    for ingred, count in sorted(freq.iteritems(), key=operator.itemgetter(1), reverse=True):
        nonstemmed_ingred = mapping[ingred]
        # if multiple words map to the same stem, take the shortest one
        dbingred = min(nonstemmed_ingred, key=len)
        insert_ingred(cursor, cuisine, dbingred, count)
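The shortest-surface-form rule at the end is worth isolating. A self-contained sketch of just that step, where the stems and words are hypothetical stand-ins for what PorterStemmer would produce:

from collections import defaultdict

mapping = defaultdict(list)
for stem_, word in [('tomato', 'tomatoes'), ('tomato', 'tomato'),
                    ('onion', 'onions')]:
    mapping[stem_].append(word)

for stem_, candidates in mapping.iteritems():
    # the shortest candidate becomes the canonical db form
    print stem_, '->', min(candidates, key=len)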
def __init__(self):
    self.stemmer = stemmer.PorterStemmer()
    self.cleaner = Cleaner(style=True)
    self.stopWords = []
    # check for and open the same file ('stopwords'); the original
    # tested a differently-cased name than it opened
    if os.path.exists('stopwords'):
        self.stopWords = [line.strip() for line in open('stopwords')]
    else:
        self.stopWords = features.stopWords
def _clean_word(self, word):
    word = word.lower()
    for punc in Document.PUNCTUATION + Document.CARRIAGE_RETURNS:
        word = word.replace(punc, '').strip("'")
    # stemmer: dogs -> dog ; created -> creat
    ps = stemmer.PorterStemmer()
    word = ps.stem(word, 0, len(word) - 1)
    return word if re.match(Document.WORD_REGEX, word) else None
def text_tokenizer(text):
    p = ps.PorterStemmer()
    tokenized = text.split()
    tokenized = [x.strip(bad_chars) for x in tokenized
                 if '&#' not in x and x != '' and '<' not in x]
    # track raw term counts before stemming
    for t in tokenized:
        if t not in terms_before_preprocess:
            terms_before_preprocess[t] = 1
        else:
            terms_before_preprocess[t] += 1
    return [p.stem(x.lower(), 0, len(x) - 1) for x in tokenized]
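This tokenizer leans on three module-level names that are not shown here. A sketch of minimal stand-in definitions (the values are illustrative assumptions, not the originals) before calling it:

import stemmer as ps

bad_chars = '.,;:!?\'"()'
terms_before_preprocess = {}

print text_tokenizer('Dogs <b>barked</b> loudly, twice.')
# the '<...>' token is dropped; the rest are stripped, lowercased,
# and stemmed, e.g. ['dog', 'loudli', 'twice']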
def __init__(self, storedb, logfunc=None):
    print 'Initializing ANEW module'
    self.store_db = storedb
    self.valence_db = ValenceDB()
    self.stemmer = stemmer.PorterStemmer()
    self.logfunc = logfunc
    self.total_count = 0
    # Connect to ANEW database
    self.valence_db.connect()
def stem(word):
    """
    Returns Porter stemmed version of words. Input can either be a
    string or a list of strings.
    """
    p = stemmer.PorterStemmer()
    if isinstance(word, str):
        # Input is a single word
        return p.stem(word, 0, len(word) - 1)
    else:
        # Assume input is a list of words
        return [p.stem(w, 0, len(w) - 1) for w in word]
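Example calls, assuming the classic porter.py module is importable as stemmer (its stem(word, i, j) signature is the one used throughout this file):

print stem('running')            # 'run'
print stem(['dogs', 'created'])  # ['dog', 'creat']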
def stemWords(tokenList):
    p = stemmer.PorterStemmer()
    stemmedList = []
    for word in tokenList:
        prevWord = ""
        # stem the token until it doesn't change any more
        while word != prevWord:
            prevWord = word
            word = p.stem(word, 0, len(word) - 1)
        stemmedList.append(word)
    return stemmedList
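The inner loop re-applies the stemmer until the token stops changing, guarding against the rare inputs for which a single Porter pass is not a fixed point. A usage sketch under the same porter.py assumption:

print stemWords(['dogs', 'created', 'relational'])
# expected along the lines of ['dog', 'creat', 'relat']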
def normalizeText(self, text):
    text = text.lower()
    text = re.sub('[^0-9a-zA-Z]+', ' ', text)
    articleWords = text.split()
    articleWords = self.removeStopWords(articleWords)
    stemmedWords = []
    # create the stemmer once rather than per word
    p = stemmer.PorterStemmer()
    for word in articleWords:
        stemmed = p.stemWord(word)
        # remember the original word so stems can be mapped back later
        self.reverseStemHashtable[stemmed] = word
        stemmedWords.append(stemmed)
    return stemmedWords
def parser(self):
    """
    Here I use html5lib to parse the pages retrieved. I am using
    BeautifulSoup as my parser here and I know it is deprecated; I will
    change this soon. Content is taken only from <p> tags, so this
    could be a lot more robust. All words are stemmed and stopwords
    are removed.
    """
    # get stopwords; remove newline char
    parsed_html = {}
    stopwords = [word[:-1] for word in open('stopwords.txt')]
    pstemmer = stemmer.PorterStemmer()
    # grab all html docs and parse them
    htmldocs = os.listdir('pages/')
    words_splitter = re.compile(r'\W*')  # split on non-word characters
    for htmldoc in htmldocs:
        f = open('pages/' + htmldoc, 'r')
        link = f.readline()
        html = f.readlines()
        try:
            print htmldoc
            p = html5lib.HTMLParser(
                tree=treebuilders.getTreeBuilder('beautifulsoup'))
            tree = p.parse(html)
        except:
            os.remove(os.path.join('pages', htmldoc))
            print 'error parsing %s' % htmldoc
            continue
        title = tree.findAll('title')
        if title:
            title = title[0].text
        else:
            title = ''
        # grab text from p tags
        data = [para.text.lower() for para in tree.findAll('p')]
        # remove stopwords, then stem what remains
        unstemmed_words = [word for word in words_splitter.split(''.join(data))
                           if word != '' and word not in stopwords]
        stemmed_words = [pstemmer.stem(word, 0, len(word) - 1)
                         for word in unstemmed_words]
        parsed_html[(title, int(htmldoc), link)] = stemmed_words
    return parsed_html
def text_tokenizer(text, topic, isTrain, doc):
    p = ps.PorterStemmer()
    tokenized = [x.strip(bad_chars) for x in text.split()
                 if '&#' not in x and x != '' and '<' not in x]
    tokenized = [p.stem(x.lower(), 0, len(x) - 1) for x in tokenized
                 if x not in stopwords]
    terms = collections.Counter(tokenized)
    if isTrain == 1:
        tokens_by_topic[topic] += tokenized
        add_to_dict(list(terms), doc)
    return terms
def stem(f):
    p = stemmer.PorterStemmer()
    infile = open(f, 'r')
    # accumulate output across all lines (the original reset it per
    # line, so only an empty string ever reached the output file)
    output = ''
    while 1:
        word = ''
        line = infile.readline()
        if line == '':
            break
        for c in line:
            if c.isalpha():
                word += c.lower()
            else:
                if word:
                    output += p.stem(word, 0, len(word) - 1)
                    word = ''
                output += c.lower()
    des_filename = "pre_stem.txt"
    open(des_filename, 'w').write(output)
    infile.close()
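Usage sketch; 'corpus.txt' is a hypothetical input file, and the destination name is hard-coded inside stem():

stem('corpus.txt')
print open('pre_stem.txt').read()  # lowercased text with every alphabetic run stemmed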
def getWordsFrom(string):
    # get the list of words from the './string' directory
    string = string.lower()
    wordList = []
    for root, dirs, files in os.walk('./'):
        if string in root.lower():
            for file in files:
                f = open(root + '/' + file, 'r')
                tmp = [x.lower() for x in re.split('[^a-zA-Z]+', f.read())]
                # keep only words of length >= 2 that are useful for
                # deciding spam or not
                tmp = filter(lambda a: len(a) >= 2 and a not in unusedWords, tmp)
                wordList += tmp
                f.close()
    p = stemmer.PorterStemmer()
    for i in xrange(len(wordList)):
        wordList[i] = p.stem(wordList[i], 0, len(wordList[i]) - 1)
    wordList = list(set(wordList))  # remove duplicate words
    wordList.sort()
    return wordList
def __init__(self):
    self.stemmer = stemmer.PorterStemmer()
    self.stop_words = self.load_stop_words(
        os.path.join(os.path.abspath(os.path.dirname(__file__)),
                     'english.stop'))
def and_not_comp(set1, set2):
    #print '\t', set1, '-', set2
    return set2.difference(set1)


def clean(s):
    # strip punctuation that would otherwise end up inside index terms
    for ch in (',', '(', ')', "'", '"', ';', '.'):
        s = s.replace(ch, '')
    return s


operators = {'&&': and_comp, '||': or_comp, '&^': and_not_comp}
stemmer = stemmer.PorterStemmer()


def index(id, doc):
    terms = doc.split()
    for term in terms:
        term = term.lower()
        term = clean(term)
        doc_ids = inverted_index.get(term)
        if doc_ids:
            doc_ids.add(id)
        else:
            inverted_index[term] = set()
            inverted_index[term].add(id)
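A minimal end-to-end sketch: build the index over two toy documents and intersect posting sets for a two-term AND query. and_comp and or_comp are defined elsewhere in this module, so plain set intersection stands in for '&&' here:

inverted_index = {}

index(1, 'the quick brown fox')
index(2, 'the lazy brown dog')

docs_a = inverted_index.get('quick', set())
docs_b = inverted_index.get('brown', set())
print docs_a.intersection(docs_b)  # set([1])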
import stemmer as ps
import pickle
import sys

dictionary = pickle.load(open("dictionary", "rb"))
posindex = pickle.load(open("posindex", "rb"))
p = ps.PorterStemmer()


def handle_conj_query(word_list):
    try:
        w1_id = dictionary[word_list[0]]
    except KeyError:
        return []
    doclist1 = list(posindex[w1_id].keys())
    for i in range(1, len(word_list)):
        if not doclist1:
            return []
        w2_id = dictionary[word_list[i]]
        doclist2 = list(posindex[w2_id].keys())
        # keep only documents containing every query term so far
        doclist1 = [x for x in doclist1 if x in doclist2]
    return doclist1


def handle_phrase_query(word_list):
    # a phrase is a proximity query with zero allowed gap between terms
    return handle_prox_query(word_list, [0] * (len(word_list) - 1))


def handle_prox_query(word_list, prox_list):
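To exercise handle_conj_query without the pickled files, the two structures can be stubbed out. The shapes below (term -> id, id -> {doc: positions}) are inferred from the lookups above and are illustrative only:

dictionary = {'cat': 0, 'dog': 1}
posindex = {0: {'d1': [3], 'd2': [7]},   # 'cat' occurs in d1 and d2
            1: {'d2': [1], 'd3': [4]}}   # 'dog' occurs in d2 and d3

print handle_conj_query(['cat', 'dog'])  # ['d2']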
def __init__(self, imap):
    self.ps = stemmer.PorterStemmer()
    self.imap = imap
    self.db = psycopg.connect('dbname=imapindex host=db user=dustin ' +
                              'password=blahblah', serialize=0)
    self.c = self.db.cursor()