class WindowPorterStemStringFeature(object):
    """Feature generator emitting Porter-stemmed tokens from a +/-3 token
    window around an annotation, tagged with their side and distance."""

    def __init__(self):
        self._stemmer = PorterStemmer()

    def get_id(self):
        return 'WINDOW-STEM-STRING'

    def _stem(self, token):
        # porter.py-style API: stem(buffer, start, end_inclusive).
        return self._stemmer.stem(token, 0, len(token) - 1)

    def featurise(self, document, sentence, annotation):
        NORMALISE = True
        # Tokens to the left of the annotation, nearest one first.
        left = sentence.text[:annotation.start].split()
        left.reverse()
        # Tokens to the right of the annotation.
        right = sentence.text[annotation.end:].split()

        feats = []
        # izip caps the window at three tokens on each side.
        for dist, tok in izip(xrange(1, 4), left):
            feats.append((u'-BEFORE-{}-{}'.format(dist, self._stem(tok)), 1))
        for dist, tok in izip(xrange(1, 4), right):
            feats.append((u'-AFTER-{}-{}'.format(dist, self._stem(tok)), 1))

        for name, value in feats:
            if NORMALISE:
                # Spread unit mass evenly over the emitted features.
                yield (name, value / float(len(feats)))
            else:
                yield (name, value)
def stem_words(self, terms):
    """Remove the suffixes in terms using the Porter stemming algorithm."""
    stemmer = PorterStemmer()
    stemmed = []
    for term in terms:
        stemmed.append(stemmer.stem(term))
    return stemmed
def main(argv): files = os.listdir(sys.argv[1]) file = files[0] stemmed = [] for file in files: text = "" infile = open(sys.argv[1] + file) a = infile.readline() while a: text += removeSGML(a) a = infile.readline() tok = tokenizeText(text) removed = removeStopwords(tok) from porter import PorterStemmer p = PorterStemmer() for element in removed: stemmed.append(p.stem(element, 0, len(element) - 1)) print "Words " + str(len(stemmed)) unique = list(set(stemmed)) print "Vocabulary " + str(len(unique)) wordfrequency = [(unique[x], stemmed.count(unique[x])) for x in range(0, len(unique))] sort = sorted(wordfrequency, key=getKey, reverse=True) for i in range(0, 49): print sort[i]
def search(self, word):
    """Look up *word* (Porter-stemmed) in the index and return all matching
    index_ref rows, augmented with the stem and document name; returns an
    empty list when the stem is not indexed."""
    # Create an instance of the Porter Stemmer.
    PS = PorterStemmer()
    # Get the information for the supplied word.
    res = self.manage_DB.get_info("index_word", where={"word": PS.stem(word, 0, len(word) - 1)})
    # The supplied word exist in the index_word table.
    if res:
        # Extract the id for the supplied word.
        wordid = res["id"]
        # Get all the entries in the index reference database that refer to
        # the supplied wordid.
        res = self.manage_DB.get_info("index_ref", where={"wordid": wordid})
        # For ever entry in the list.
        for row in res:
            # NOTE(review): rows are read positionally (row[1], row[2]) but
            # written with string keys -- confirm get_info's row type
            # supports both forms of access.
            # Modify the current row to contain the stem word.
            row["word"] = self.manage_DB.get_info("index_word", rowid=row[1])["word"]
            # Modify the current row to contain the document name.
            row["doc"] = self.manage_DB.get_info("document", rowid=row[2])["name"]
        # Return the list of all the results.
        return res
    # The supplied word does not exist in the index_word table, so return
    # and empty list.
    else:
        return []
def stemList(list1):
    """Return a new list with the Porter stem of every word in list1."""
    stemmer = PorterStemmer()
    return [stemmer.stem(word, 0, len(word) - 1) for word in list1]
def stemWords(input):
    """Porter-stem every token; the input list is updated in place
    (as in the original) and returned for convenience."""
    stemmer = PorterStemmer()
    for position, token in enumerate(input):
        input[position] = stemmer.stem(token, 0, len(token) - 1)
    return input
def get_stemmed_words(word_list):
    """Return the set of Porter stems of the given words.

    Fix: the original called set.union() with the stemmed *string*, which
    unions the individual characters of the stem into the set; use add()
    so the whole stem is stored.
    """
    stemmer = PorterStemmer()
    stemmed_words = set()
    for word in word_list:
        stemmed_words.add(stemmer.stem(word, 0, len(word) - 1))
    return stemmed_words
def stemWords(tokens):
    '''
    input: list of tokens
    output: list of stemmed tokens
    use the porter.py
    '''
    stemmer = PorterStemmer()

    def _stem_one(tok):
        # Inclusive-end interface of porter.py's stem().
        return stemmer.stem(tok, 0, len(tok) - 1)

    # map() kept so the return type matches the original on both
    # Python 2 (list) and Python 3 (iterator).
    return map(_stem_one, tokens)
def read_vocabulary(path):
    """Read a comma-separated vocabulary file and return the list of
    Porter-stemmed words.

    Fix: the file handle was never closed; a with-block releases it.
    """
    with open(path) as f:
        words = f.read().strip().split(", ")
    stemmer = PorterStemmer()
    vocab = []
    for word in words:
        vocab.append(stemmer.stem(word, 0, len(word) - 1))
    return vocab
def process(input):
    """Tokenize *input*, drop stopwords, and return the Porter stems."""
    from porter import PorterStemmer
    tokens = tokenizeText(input)
    kept = removeStopwords(tokens)
    stemmer = PorterStemmer()
    return [stemmer.stem(tok, 0, len(tok) - 1) for tok in kept]
def process(input):
    """Tokenize *input* and return the Porter stems (stopword removal is
    intentionally disabled in this variant)."""
    from porter import PorterStemmer
    tokens = tokenizeText(input)
    stemmer = PorterStemmer()
    stemmed = []
    for tok in tokens:
        stemmed.append(stemmer.stem(tok, 0, len(tok) - 1))
    return stemmed
def create_posting_list(self, stopword_toggle, stemming_toggle):
    """
    function to go through all the documents abstracts cleaning and
    adding each term to a posting_list object and the term dictionary.
    removes all the special characters for each term.
    toggles stopwords and stemming accordingly
    Note: all terms are converted to lowercase

    Fixes: the PorterStemmer is built once instead of per word, dict
    membership no longer goes through .keys(), and the output files are
    written with with-blocks.

    :param stopword_toggle: boolean, toggles the stopword usage
    :param stemming_toggle: boolean, toggles the stemming of words
    """
    self.terms = {}
    self.termsDictionary = {}
    stopwords = fetch_stopwords() if stopword_toggle else []
    # Hoisted out of the per-word loop; one stemmer serves all terms.
    stemmer = PorterStemmer() if stemming_toggle else None
    for doc_id, document in self.documents.items():
        if 'abstract' not in document:
            continue
        for index, word in enumerate(document['abstract'].split(' ')):
            word = word.rstrip().lower()
            for a in [',', '.', '{', '}', '(', ')', ';', ':', '"', '\'']:
                if a in word:
                    if word.index(a) == 0 or word.index(a) == len(word) - 1:
                        # NOTE: replace() removes every occurrence of the
                        # character, not only the edge one (original
                        # behavior preserved).
                        word = word.replace(a, '')
            if stemmer is not None:
                word = stemmer.stem(word, 0, len(word) - 1)
            if word in stopwords:
                continue
            if len(word) > 0:
                if word not in self.terms:
                    self.terms[word] = {}
                if doc_id not in self.terms[word]:
                    self.terms[word][doc_id] = {
                        'frequency': 0,
                        'position': [],
                    }
                self.terms[word][doc_id]['frequency'] += 1
                self.terms[word][doc_id]['position'].append(index)
    # Document frequency per term.
    for term, value in self.terms.items():
        self.termsDictionary[term] = len(value)
    with open('dictionary.json', 'w') as f:
        f.write(json.dumps(self.termsDictionary, indent=4, sort_keys=True))
    with open('posting-list.json', 'w') as f:
        f.write(json.dumps(self.terms, indent=4, sort_keys=True))
def index_document(self, docid, path_physical): self.manage_DB.delete_references(docid) # Get the information for the supplied document. document = self.manage_DB.get_info('document', rowid=docid) # Open the document for reading. fhandle = open('%s%s' % (path_physical, docid), 'r') # Create an instance of the Porter Stemmer. PS = PorterStemmer() # Get the 1st line of the supplied document and force the contents to # lowercase. content = fhandle.readline().lower() # The text widget starts indexing its lines at 1, but columns start # indexing at 0. line_count = 1 # While the supplied document has content to be read. while content != '': # Find all words from the current line of the supplied document # and put them in a list. words = re.findall('\w+', content) # For each word in the list of words from the current line. for word in words: # Only words whose length is greater than 3 will be indexed. if len(word) > 3: # Check for the word in the list of stop words. res = self.manage_DB.get_info('stop_words', where={ 'word': word}) # If the word does not exist in the list of stop words: if not res: # The column of the current word is its index in the # current line. col_count = content.find(word) + 1 # Using the PorterStemmer, find the root of the current # word. Add the root word, with the current line and # column number to the index. self.add_index_word( PS.stem(word, 0, len(word) - 1), docid, line_count, col_count, word) # Get the next line of the supplied document and force the # contents to lowercase. content = fhandle.readline().lower() # Increment the line count. line_count += 1 # Close the supplied document file. fhandle.close() return
def stem(tokens):
    """
    receive tokens
    return stemmedTokens
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(token, 0, len(token) - 1) for token in tokens]
def tokenize(inputStr):
    """Split *inputStr* into lowercase, punctuation-trimmed, stopword-free
    Porter stems.

    Splits on any character outside [a-zA-Z0-9.,_], trims leading/trailing
    ',' and '.', removes embedded commas, lowercases, drops stopwords and
    empty tokens, then stems what remains.

    Fix: None-comparison uses `is not None` (identity) instead of `!=`;
    stale commented-out alternatives removed.
    """
    tokenPattern = re.compile(r'[^a-zA-Z0-9.,_]')
    primordialTokens = re.split(tokenPattern, inputStr)
    cleaned = [x.strip(',.').replace(",", "").lower()
               for x in primordialTokens if x is not None]
    cleaned = [x for x in cleaned if x != '' and x not in stop_words]
    # Porter-stem the surviving tokens.
    p = PorterStemmer()
    return [p.stem(x, 0, len(x) - 1) for x in cleaned]
class StringPorterStemFeature(object):
    """Feature: the Porter stem of the annotation's surface string.

    Fix: `_stem` contained a duplicated, unreachable return statement;
    the dead copy has been removed.
    """

    def __init__(self):
        self._stemmer = PorterStemmer()

    def _stem(self, token):
        # porter.py-style API: stem(buffer, start, end_inclusive).
        return self._stemmer.stem(token, 0, len(token) - 1)

    def get_id(self):
        return 'STRING-STEM'

    def featurise(self, document, sentence, annotation):
        yield (self._stem(sentence.annotation_text(annotation)), 1)
class SentencePorterStemStringFeature(object):
    """Emit one (stem, 1) feature for every whitespace-delimited token of
    the sentence containing the annotation."""

    def __init__(self):
        self._stemmer = PorterStemmer()

    def get_id(self):
        return 'STRING-SENTENCE-STEM'

    def _stem(self, token):
        # Inclusive-end interface of porter.py's stem().
        return self._stemmer.stem(token, 0, len(token) - 1)

    def featurise(self, document, sentence, annotation):
        for tok in sentence.text.split():
            yield (self._stem(tok), 1)
def porter(text):
    """Scan *text* character by character, Porter-stem each alphabetic run,
    and return the resulting list of stems.

    Fixes: the original iterated over whole LINES of text.split('\n') as if
    they were single characters, so any line containing more than one word
    was never tokenized; it also dropped a word that ended the text.
    """
    p = PorterStemmer()
    output = ''
    word = ''
    for c in text:
        if c.isalpha():
            word += c.lower()
        else:
            if word:
                output += p.stem(word, 0, len(word) - 1)
                word = ''
            output += c.lower()
    # Flush a trailing word not followed by a separator.
    if word:
        output += p.stem(word, 0, len(word) - 1)
    return output.split()
def add(self, text, fname, stem=False):
    """Add a string of text to the corpus by first splitting it into
    features defined by WORD_PAT, and then removing stop words. Takes a
    string as its argument."""
    for match in re.finditer(self.WORD_PATT, text):
        if match:
            word = match.group(0).lower()
            if word in self.STOPWORDS:
                self.removed.append(word)
                # NOTE(review): stopwords are recorded in self.removed but
                # are ALSO added to self.words before the continue --
                # confirm this is intentional.
                self.words.add_word(word, fname)
                continue
            if stem:
                # NOTE(review): a new PorterStemmer is built for every
                # word, and the stemmed result is never stored or added to
                # the corpus -- this branch looks unfinished (possibly a
                # truncated chunk).
                p = PorterStemmer()
                word = p.stem(word, 0, len(word)-1)
def revise_documents(docs, vocab):
    """Reduce each document's text to its stemmed in-vocabulary words and
    collect numeric sense labels (> 100000) into {reference: sense}."""
    stemmer = PorterStemmer()
    senses = {}
    for ref, text in docs.items():
        tokens = re.findall(r"[\w']+", text)
        kept = []
        for tok in tokens:
            # Skip the literal marker token.
            if tok == "tag":
                continue
            # Large integers encode the sense label for this reference.
            if tok.isdigit() and int(tok) > 100000:
                senses[ref] = tok
                continue
            if tok in vocab:
                kept.append(stemmer.stem(tok.lower(), 0, len(tok) - 1))
        docs[ref] = kept
    return docs, senses
def score_query(self, query, word_matrix, normalized_matrix, stop_words_list, title_vocabulary_dict):
    """Score documents against *query*: stem and count the query terms,
    tf-idf-weight them, then accumulate cosine-style scores per document,
    with a +0.5 bonus when a query term appears in a document's title."""
    porter_stemmer = PorterStemmer()
    square_sum = 0
    words = {}
    # Count repetitions of each stemmed, punctuation-stripped query term.
    for word in query:
        word_without_punctuation = word.strip(string.punctuation).replace(
            " ", "").lower()
        if word_without_punctuation not in stop_words_list:
            stemmed_word = porter_stemmer.stem(
                word_without_punctuation, 0,
                len(word_without_punctuation) - 1)
            if stemmed_word not in words:
                words[stemmed_word] = {}
                words[stemmed_word]['repetitions'] = 0
            words[stemmed_word]['repetitions'] += 1
    # Euclidean norm (squared) of the query's term-frequency vector.
    for word, elements in words.items():
        square_sum += math.pow(elements['repetitions'], 2)
    # Normalize and weight each query term by its idf.
    for word, elements in words.items():
        if word in word_matrix:
            words[word]['normalized'] = words[word][
                'repetitions'] / math.sqrt(square_sum)
            words[word]['weight'] = words[word][
                'normalized'] * word_matrix[word]['idf']
        else:
            # Unknown terms contribute nothing to the score.
            words[word]['normalized'] = 0
            words[word]['weight'] = 0
    aggregate_scores = {}
    # Documents already granted the one-time title bonus.
    title_addition_performed = []
    for word, elements in words.items():
        if word in normalized_matrix:
            for doc_id, doc_weight in normalized_matrix[word].items():
                if doc_id not in aggregate_scores:
                    aggregate_scores[doc_id] = 0
                aggregate_scores[doc_id] += doc_weight * elements['weight']
                if word in title_vocabulary_dict:
                    # NOTE(review): the bonus is applied at most once per
                    # document across ALL terms -- confirm that is the
                    # intended semantics.
                    if doc_id in title_vocabulary_dict[
                            word] and doc_id not in title_addition_performed:
                        aggregate_scores[doc_id] += 0.5
                        title_addition_performed.append(doc_id)
    return aggregate_scores
def search(self, word):
    """Look up *word* (Porter-stemmed) in the index_word table and return
    all index_ref rows as dictionaries, resolving the stem and document
    name; returns an empty list when the stem is not indexed."""
    # Create an instance of the Porter Stemmer.
    PS = PorterStemmer()
    # Get the information for the supplied word.
    res = self.manage_DB.get_index_word_info(
        PS.stem(word, 0, len(word) - 1))
    # The supplied word exist in the index_word table.
    if res:
        # Extract the id for the supplied word.
        wordid = res['id']
        # Return the found entries as a list.
        res = []
        # Query the index_ref table for all the entries whose wordid
        # match the supplied word's id.
        self.c.execute("""select * from index_ref where wordid=?""",
            (wordid,))
        # Retrieve all the results of the query as a list.
        entries = self.c.fetchall()
        # For ever entry in the list.
        for row in entries:
            # Create a dictionary with the results and add the dictionary
            # to the list.
            res.append({
                'id': row[0],
                'word': self.manage_DB.get_index_word_info(row[1])['word'],
                'docid': row[2],
                'doc': self.manage_DB.get_document_info(row[2])['name'],
                'line': row[3],
                'column': row[4],
                'branch_word': row[5]})
        # Return the list of all the results.
        return res
    # The supplied word does not exist in the index_word table, so return
    # and empty list.
    else:
        return []
def tokenize_on_porter(text):
    """Character-scan *text* line by line, Porter-stemming each alphabetic
    run; writes each transformed line to the file 'out3' and returns the
    list of stems in order of appearance."""
    word_list = []
    p = PorterStemmer()
    outfile = open('out3', 'w')
    for line in text.splitlines():
        output = ''
        word = ''
        if line != '':
            for c in line:
                if c.isalpha():
                    word += c.lower()
                else:
                    # Non-alphabetic char terminates the current word.
                    if word:
                        word_stem = p.stem(word, 0, len(word)-1)
                        output += word_stem
                        word_list.append(word_stem)
                        word = ''
                    output += c.lower()
        # NOTE(review): a word that runs to the end of a line is discarded
        # when `word` is reset on the next iteration -- confirm whether a
        # flush before printing was intended.
        print(output, end='\n', file=outfile)
    outfile.close()
    return word_list
def make_cloud(self):
    """Build word-cloud data: stem the quoted words, accumulate frequencies
    per stem, pick the most frequent surface form of each stem, drop
    stopwords, and hand the top 50 entries to process_cloud.

    NOTE(review): Python 2 only -- relies on list-returning dict.items(),
    list.sort(cmp_function) and the cmp() builtin.
    """
    stemdict, tempdict, finaldict = {}, {}, {}
    # NOTE(review): file handle is never closed; bare excepts below swallow
    # all errors (used as a poor-man's setdefault).
    stopwords = open('stopwords.txt', 'r').read().split('\n')
    # Extract just the words inside quotes
    quotes = ' '.join(self.extract_quotes())
    wordlist = re.split('\s+', quotes.lower())
    p = PorterStemmer()
    punctuation = re.compile(r'[.?!,":;-]')
    # Stem all of the words in the word list using the Porter Stemmer
    for w in wordlist:
        w = punctuation.sub('', w)
        s = p.stem(w, 0,len(w)-1)
        try:
            tempdict[w] += 1
        except:
            tempdict[w] = 1
        # Map stem -> {surface form: count}.
        stemdict.setdefault(s,{}).update({w:tempdict[w]})
    cumfreq = 0
    # Calculate the cumulative frequencies of the stemmed words
    for k, v in stemdict.items():
        for l, m in v.items():
            cumfreq = cumfreq + m
        items = v.items()
        # Most frequent surface form represents the stem.
        items.sort(lambda x, y: cmp(y[1], x[1]))
        finaldict[items[0][0]] = cumfreq
        cumfreq = 0
    # Remove stopwords like "the", "it", "a", etc.
    for word in stopwords:
        try:
            del finaldict[word]
        except:
            pass
    results = self.process_cloud(8, finaldict.items()[:50])
    return results
def search(self, word, docid=None):
    """Look up *word* (Porter-stemmed) in the index; when *docid* is given,
    restrict the matches to that document. Each returned row is augmented
    with the stem and document name; empty list when not indexed."""
    # Create an instance of the Porter Stemmer.
    PS = PorterStemmer()
    # Get the information for the supplied word.
    res = self.manage_DB.get_info('index_word', where={
        'word': PS.stem(word, 0, len(word) - 1)})
    # The supplied word exist in the index_word table.
    if res:
        # Extract the id for the supplied word.
        wordid = res[0]['id']
        if docid:
            # Get all the entries in the index reference database that refer to
            # the supplied wordid.
            res = self.manage_DB.get_info('index_ref', where={
                'wordid': wordid, 'docid': docid})
        else:
            # Get all the entries in the index reference database that refer to
            # the supplied wordid.
            res = self.manage_DB.get_info('index_ref', where={
                'wordid': wordid})
        # For ever entry in the list.
        for row in res:
            # Modify the current row to contain the stem word.
            row['word'] = self.manage_DB.get_info(
                'index_word', rowid=row['wordid'])['word']
            # Modify the current row to contain the document name.
            row['doc'] = self.manage_DB.get_info(
                'document', rowid=row['docid'])['name']
        # Return the list of all the results.
        return res
    # The supplied word does not exist in the index_word table, so return
    # and empty list.
    else:
        return []
def getDocStuff(dDocProps):
    """Gather the T/W/A field lines of a document record, clean, stopword-
    filter and Porter-stem the words, update the global corpus length, and
    return [fixed-width count summary, stemmed words, original words]."""
    lAllLists = []
    # Collect whichever of the title (T), body (W) and author (A) field
    # line-lists are present, registering each with putinDPLace.
    if (constants.T in dDocProps):
        lAllLists.append(dDocProps[constants.T])
        putinDPLace("1",dDocProps[constants.T])
    if (constants.W in dDocProps):
        lAllLists.append(dDocProps[constants.W])
        putinDPLace("2",dDocProps[constants.W])
    if (constants.A in dDocProps):
        lAllLists.append(dDocProps[constants.A])
        putinDPLace("3",dDocProps[constants.A])
    lAllLines = []
    for lList in lAllLists:
        lAllLines.extend(lList)
    lAllWords = []
    for sLine in lAllLines:
        # Replace every non-alphanumeric character with a space, then split.
        sLine = re.sub('[^a-zA-Z0-9]', ' ', sLine)
        lWords = sLine.lower().split()
        lAllWords.extend(lWords)
    # Keep a pre-stopword copy of the cleaned words for the caller.
    lw = copy.deepcopy(lAllWords)
    lAllWords = helperFunctions.remStopWords(lAllWords)
    p = PorterStemmer()
    lAllWordsStemmed = []
    for word in lAllWords:
        word = p.stem(word,0,len(word)-1)
        lAllWordsStemmed.append(word)
    lUniqueWords = list(set(lAllWordsStemmed))
    lenAllWords = len(lAllWordsStemmed)
    # Side effect: accumulate the corpus-wide word count.
    constants.allDocsLen = constants.allDocsLen+lenAllWords
    sRet = helperFunctions.makeFixedLengthStr(len(lAllWordsStemmed),constants.docWordCntLen)+constants.space+helperFunctions.makeFixedLengthStr(len(lUniqueWords),constants.docWordCntLen)+constants.newLine
    return [sRet,lAllWordsStemmed," ".join(lw)]
def process_query(self, query):
    """Turn a free-text query into {term: tf-idf weight}.

    Duplicate query words are folded into a frequency count before
    weighting; stopword removal and stemming follow the inverter's
    toggles so query terms match the index vocabulary.

    Fixes: the PorterStemmer is created once instead of per word, and
    dictionary membership no longer materializes .keys().
    """
    all_doc_count = len(self.invert.documents.keys())
    query_array = [x.lower() for x in query.split(' ')]
    query_weights = {}
    stopwords = fetch_stopwords() if self.stopword_toggle else []
    # Hoisted: one stemmer instance for the whole query.
    stemmer = PorterStemmer() if self.stemming_toggle else None
    while query_array:
        word = query_array.pop(0)
        frequency = 1
        # Trim edge punctuation (replace() removes all occurrences of the
        # matched character, preserving the original behavior).
        for a in [',', '.', '{', '}', '(', ')', ';', ':', '"', '\'']:
            if a in word:
                if word.index(a) == 0 or word.index(a) == len(word) - 1:
                    word = word.replace(a, '')
        # Fold later duplicates of this word into its frequency.
        while word in query_array:
            query_array.pop(query_array.index(word))
            frequency += 1
        if stemmer is not None:
            word = stemmer.stem(word, 0, len(word) - 1)
        if word in stopwords:
            continue
        term_weight = 0
        if word in self.invert.termsDictionary:
            document_frequency = self.invert.termsDictionary[word]
            idf = math.log(all_doc_count / document_frequency)
            term_frequency = 1 + math.log(frequency)
            term_weight = idf * term_frequency
        query_weights[word] = term_weight
    return query_weights
def getDocStuff(dDocProps):
    """Collect the T/W/A field lines of a document, stopword-filter and
    Porter-stem their words, and return [fixed-width count summary,
    stemmed word list].

    Fix: removed a bare `lenAllWords` expression statement that had no
    effect, plus the then-unused local it referenced.
    """
    global T, W, B, A, N, I
    selected = []
    # Only the title (T), body (W) and author (A) fields are indexed;
    # B and N were deliberately excluded in the original.
    if T in dDocProps:
        selected.append(dDocProps[T])
    if W in dDocProps:
        selected.append(dDocProps[W])
    if A in dDocProps:
        selected.append(dDocProps[A])
    lines = []
    for part in selected:
        lines.extend(part)
    words = []
    for line in lines:
        words.extend(line.split())
    words = helperFunctions.remStopWords(words)
    stemmer = PorterStemmer()
    stemmed = []
    for word in words:
        stemmed.append(stemmer.stem(word, 0, len(word) - 1))
    unique = list(set(stemmed))
    sRet = makeFixedLengthStr(len(stemmed), 6) + " " + \
        makeFixedLengthStr(len(unique), 6)
    return [sRet, stemmed]
def main(argv): files = os.listdir(sys.argv[1]) file = files[0] stemmed = [] for file in files: text = "" infile = open(sys.argv[1] + file) a = infile.readline() while a: text += removeSGML(a) a = infile.readline() tok = tokenizeText(text) removed = removeStopwords(tok) from porter import PorterStemmer p = PorterStemmer() for element in removed: stemmed.append(p.stem(element, 0, len(element)-1)) print "Words " + str(len(stemmed)) unique = list(set(stemmed)) print "Vocabulary " + str(len(unique)) wordfrequency = [(unique[x], stemmed.count(unique[x])) for x in range(0,len(unique))] sort = sorted(wordfrequency, key = getKey, reverse = True) for i in range(0,49): print sort[i]
print("dataset imported") import re import nltk nltk.download('stopwords') # to remove stopword from nltk.corpus import stopwords # for Stemming propose #from nltk.stem.porter import PorterStemmer from porter import PorterStemmer p = PorterStemmer() p.stem("Alcoholic") # Initialize empty array # to append clean text corpus = [] for i in range(0, 1000): # column : "Review", row ith review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i]) # convert all cases to lower cases review = review.lower() review = review.split() ps = PorterStemmer() # loop for stemming each word # in string array at ith row review = [
# Output files for intermediate data, comparison data and test results.
dataFile = open("temp1.txt", "w")
comDataFile = open("com.txt", "w")
testFile = open("output.txt", "w")
# Load the stopword list, one word per line.
stopWord=open("stopwords.txt").read()
stopWord=stopWord.split("\n")
stemmer= PorterStemmer()
countPo=0
countSo=0
# f and v are expected to be open file handles defined earlier in the file.
trainingSet=f.readlines()
testingSet=v.readlines()
trainingSet=trainingSet
testingSet=testingSet
#initialize the stemmer object for (optional) stemming later
stemmer= PorterStemmer()
# NOTE(review): stopWord is a *list* at this point, so stem() is being
# called on a list rather than a string -- presumably each stopword should
# be stemmed individually. TODO confirm against porter.py's stem() API.
stopWord=stemmer.stem(stopWord,0,len(stopWord)-1)


def getCleanString(string):
    """ fix the string for best results
        the cleaning involve (
        remove all the special character except _ and - ,
        convert upper case to lower case letter
        stemmering "remove [word with]ing|s|ed...etc"
        )
    """
    # Strip every non-letter (keeping - and _) and anything following
    # "http", then lowercase and Porter-stem the remainder.
    string=re.sub(r'([^a-zA-Z\-\_])|(http.+)','',string)
    string=string.lower()
    string=stemmer.stem(string,0,len(string)-1)
    return string;
def stem(word):
    """Return the Porter stem of *word*."""
    stemmer = PorterStemmer()
    last = len(word) - 1
    return stemmer.stem(word, 0, last)
def test():
    """Interactive search loop over postings.txt / cacm.all: prompt for
    terms until 'zzend', optionally stemming each query term, and print
    per-document matches with timing; finally hand the collected search
    times to shutdown()."""
    stem = input("Was the stemmer used in the inversion? (Y/N)")
    return_times = []
    g = open("postings.txt", "r")
    content = g.read().replace('\n', ' ')
    # The postings file is JSON entries with a trailing separator; wrap in
    # brackets and drop the last two characters to form a valid array.
    post_list = json.loads("[" + content[:-2] + "]")
    h = open("cacm.all", "r")
    lines = h.readlines()
    if g.mode == 'r' and h.mode == 'r':
        word = ""
        while word != "zzend":
            word = input("Enter a term to search for: ").lower()
            if stem == "Y":
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)
            found_word = False
            start = timer()
            for elem in post_list:
                if word == elem[0]:
                    found_word = True
                    print("\nThis term is found in " + str(len(elem[1])) + " documents.")
                    print(
                        "============================================================================="
                    )
                    break
            if found_word:
                print(
                    "This search term is found in the following documents:\n")
                # output all docs that contain that term: DocID, title, TF, all the positions, first occurrence with 10
                # words
                docdata = []
                for entry in post_list:
                    if entry[0] == word:
                        docdata += entry[1]
                        break
                # docdata now has doc ID, TF, and positions for each document input_txt appears in
                # now search in cacm for word data
                count = 0
                get_title = False
                abstract_bool = False
                abstract_text = ""
                title = ""
                output = ""
                found = False
                # State machine over cacm.all's .I/.T/.W/.B section markers.
                for line in lines:
                    if count == len(docdata):
                        break
                    if line.startswith(".I " + str(docdata[count][0])):
                        found = True
                    if line == ".B\n" and found:
                        get_title = False
                        abstract_bool = False
                        found = False
                        # I need to create the output string here, as its all going to be reset now.
                        output += "Document " + str(docdata[count][0]) + " - " + title + "Term frequency: " + \
                            str(docdata[count][1]) + "\nList of positions: " + str(docdata[count][2]) + \
                            "\nFirst occurrence in document: " + \
                            getcontext(title + abstract_text, docdata[count][2][0]) + "\n" + "------------" + "\n"
                        title = ""
                        abstract_text = ""
                        count += 1
                    if abstract_bool:
                        abstract_text += line
                    if line == ".W\n" and found:
                        get_title = False
                        abstract_bool = True
                    if get_title:
                        title += line
                    if line == ".T\n" and found:
                        get_title = True
            end = timer()
            elapsed_time = (end - start)
            if found_word:
                return_times += [elapsed_time]
                print(output)
                print("Search time: " + str(elapsed_time) + " seconds\n")
                # output time to results
            elif word != "zzend":
                print("Term not found in any documents")
        shutdown(return_times)
        g.close()
        h.close()
    else:
        print("Error opening file. Try again.")
def processEmail(email_contents):
    #PROCESSEMAIL preprocesses a the body of an email and
    #returns a list of word_indices
    #   word_indices = PROCESSEMAIL(email_contents) preprocesses
    #   the body of an email and returns a list of indices of the
    #   words contained in the email.
    #

    # Load Vocabulary
    vocab = getVocabDict()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers

    # hdrstart = email_contents.find('\n\n')
    # email_contents = email_contents[hdrstart+2:]

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with >
    # and does not have any < or > in the tag and replace it with a space
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print '\n==== Processed Email ====\n'

    # Process file
    l = 0

    porterStemmer = PorterStemmer()
    # Tokenize and also get rid of any punctuation
    sep = '[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\},\'\"\>\_\<\;\%\n\r]+'
    for s in re.split(sep, email_contents):
        # Remove any non alphanumeric characters
        s = re.sub('[^a-zA-Z0-9]', '', s)

        # Stem the word
        # NOTE(review): this stemmer variant takes only the word (no
        # start/end indices), unlike porter.py's stem(p, i, j) interface.
        s = porterStemmer.stem(s.strip())

        # Skip the word if it is too short
        if len(s) < 1:
            continue

        # Look up the word in the dictionary and add to word_indices if
        # found
        # ====================== YOUR CODE HERE ======================
        # Instructions: Fill in this function to add the index of str to
        #               word_indices if it is in the vocabulary. At this point
        #               of the code, you have a stemmed word from the email in
        #               the variable s. You should look up s in the
        #               vocabulary dictionary (vocab). If a match exists, you
        #               should add the index of the word to the word_indices
        #               vector. Concretely, if s = 'action', then you should
        #               add to word_indices the value under the key 'action'
        #               in vocab. For example, if vocab['action'] = 18, then,
        #               you should add 18 to the word_indices vector
        #               (e.g., word_indices.append(18) ).
        #
        # =============================================================

        # Print to screen, ensuring that the output lines are not too long
        if l + len(s) + 1 > 78:
            print
            l = 0
        print s,
        l += len(s) + 1

    # Print footer
    print '\n========================='
    return array(word_indices)
# Output files for intermediate data, comparison data and test results.
dataFile = open("temp1.txt", "w")
comDataFile = open("com.txt", "w")
testFile = open("output.txt", "w")
# Load the stopword list, one word per line.
stopWord = open("stopwords.txt").read()
stopWord = stopWord.split("\n")
countPo = 0
countSo = 0
# f and v are expected to be open file handles defined earlier in the file.
trainingSet = f.readlines()
testingSet = v.readlines()
#initialize the stemmer object for (optional) stemming later
stemmer = PorterStemmer()
# NOTE(review): stopWord is a *list* here, so stem() is being called on a
# list rather than a string -- presumably each stopword should be stemmed
# individually. Preserved as-is pending confirmation.
stopWord = stemmer.stem(stopWord, 0, len(stopWord) - 1)


def getCleanString(string):
    """
    Normalize a string for matching: strip special characters (keeping
    _ and -) and anything after "http", lowercase, then Porter-stem.
    """
    string = re.sub(r'([^a-zA-Z\-\_])|(http.+)', '', string)
    string = string.lower()
    string = stemmer.stem(string, 0, len(string) - 1)
    # Fix: the cleaned string was computed but never returned, so every
    # caller received None; the parallel version of this helper in the
    # original codebase returns the cleaned string.
    return string
class Tokenizer():
    """Text normalizer: splits raw text into lowercased, punctuation-
    stripped, stemmed tokens with stopwords and mixed-numeric tokens
    removed."""

    def __init__(self, PATH_TO_STOP_WORDS):
        print("[Tokenizer] Instantiated!")
        self.PATH_TO_STOP_WORDS = PATH_TO_STOP_WORDS
        self.STOP_WORDS = self.load_stopwords()
        self.PorterStemmer = PorterStemmer()

    """
    Tokenizes on these rules:
    SPLIT INTO TOKENS
    STRIP TOKENS' WHITESPACES, NEWLINES AND PUNCTUATIONS DANGLING IN BETWEEN
    STEM EVERY TOKEN
    REMOVE TOKEN IF IS STOP WORD

    Returns list of text normalized tokens
    """
    def tokenize(self, input_str):
        result = []
        # input_str_list = input_str.split()
        input_str_list = re.split('\W+', input_str)
        for token in input_str_list:
            # PUNCTUATIONS is a module-level constant defined elsewhere.
            result_tok = token.strip(PUNCTUATIONS)
            # Keep tokens longer than one char that are neither stopwords
            # nor digit-led mixed-numeric strings.
            if len(result_tok) > 1 and \
                not self.is_stopword(result_tok.lower()) and \
                not self.isMixedNumeric(result_tok):
                result_tok = self.stem(result_tok)
                result.append(result_tok.lower())
        return result

    def stem(self, word):
        # porter.py-style API: stem(buffer, start, end_inclusive).
        return self.PorterStemmer.stem(word, 0, len(word) - 1)

    def remove_stopwords(self, tokens):
        return list(filter(lambda tok: tok not in self.STOP_WORDS, tokens))

    def is_stopword(self, token):
        # Returns True for stopwords, None otherwise (dict.get default).
        return self.STOP_WORDS.get(token)

    #===========================================================================#
    # STRING MANIPULATION FUNCS
    #===========================================================================#
    """ Split on 1st whitespace from back """
    def split_on_whitespace_from_back(self, input_str):
        return input_str.rsplit(' ', 1)

    """ Split on 1st '/' from back """
    def split_on_slash_from_back(self, input_str):
        return input_str.rsplit('/', 1)

    """ Trim newline char from an input string """
    def strip_newline(self, input_str):
        return input_str.strip('\n')

    """
    Determines whether an input string has the RegEx given in this function
    A RegEx match object will be returned if a complete match occurs
    """
    def isMixedNumeric(self, input_str):
        pattern = r'([0-9]+[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]*)+'
        return re.match(pattern, input_str)

    #===========================================================================#
    # SETUP
    #===========================================================================#
    def load_stopwords(self):
        # Read one stopword per line into a dict for O(1) membership tests.
        f = open(self.PATH_TO_STOP_WORDS, 'r')
        stopwords = f.read().splitlines()
        stopword_dict = {}
        for word in stopwords:
            stopword_dict[word] = True
        return stopword_dict
def lookup(user_input, CLI, K):
    """Run a free-text query against postings.txt / cacm.all: clean the
    query, optionally drop stopwords and stem (per the flags stored in the
    postings file), rank matching documents by cosine similarity, and
    return the ranked doc-id list (top K when K is not None)."""
    use_stem = False
    stop_words = False
    # NOTE(review): g is never closed on any path.
    g = open("postings.txt", "r")
    f = open("cacm.all", "r")
    content = g.read().replace('\n', ' ')
    # The first two characters of the postings file are flags recording
    # whether stemming / stopword removal were applied at inversion time.
    if content[0] == "1":
        use_stem = True
    if content[1] == "1":
        stop_words = True
    post_list = json.loads("[" + content[2:-2] + "]")
    lines = f.readlines()
    f.close()
    extracted_postings = []
    docs = []
    final_list = []
    if g.mode == 'r':
        # get query
        og_query = user_input.lower()
        og_query = re.sub('[\-]+', ' ', og_query)
        og_query = re.sub('[^A-Za-z0-9$ ]+', '', og_query)
        newquery = og_query.split()
        if stop_words:
            temp = []
            # NOTE(review): stop_words is rebound from bool flag to the
            # stopword list here and passed on below.
            stop_words = open("stopwords.txt", "r").read().split('\n')
            for i in range(len(stop_words)):
                stop_words[i] = stop_words[i].lower()
            for word in newquery:
                if word not in stop_words:
                    temp.append(word)
            newquery = temp
        if use_stem:
            stemmed_query = ""
            for word in newquery:
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)
                stemmed_query += word + " "
            newquery = stemmed_query.split()
        newquery.sort()
        term_list = get_term_lists(newquery, post_list)
        # remove duplicates if they exist
        term_list = list(dict.fromkeys(term_list))
        for entry in term_list:
            extracted_postings.append(post_list[entry])
        # get docs out of extracted postings
        for posting in extracted_postings:
            for entry in posting[1]:
                docs.append(entry[0])
        docs = list(dict.fromkeys(docs))
        docs.sort()
        document_vectors = get_doc_vector(docs, lines, use_stem, stop_words)
        # print("Relevant document vectors created. Now calculating cosine similarity")
        # now, make all of those vectors have tf values, and then weights
        cosine_list = fill_vectors(document_vectors, og_query, docs)
        temp_list = []
        for i in range(len(docs)):
            temp_list.append([docs[i], cosine_list[i]])
        # Rank by descending cosine similarity.
        temp_list.sort(key=lambda x: x[1])
        temp_list.reverse()
        if CLI:
            print("Query was: " + user_input + "\n")
            display(temp_list, get_doc_info(docs, lines))
        for elem in temp_list:
            final_list.append(elem[0])
        if K is None:
            return final_list
        else:
            return final_list[:K]
class InfoRetrieval:
    """tf-idf based document retrieval console backed by JokerDatabase.
    NOTE(review): Python 2 code throughout (iteritems, has_key, print
    statements); integer division in the idf formulas may be intentional
    or a truncation bug -- confirm."""

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.data = []              # loaded JokerData collections
        self.db = JokerDatabase()
        self.db.connect()
        self.total_docs = 0         # documents across all collections

    def restore_persisted_state(self):
        # Reload previously persisted documents and recompute idfs.
        state = self.db.restore_state()
        self.total_docs = len(state.docs)
        self.data.append(state)
        self.calculate_idfs(state)

    def stem_words(self, data):
        # Replace each document's word list with its Porter stems.
        for key, value in data.docs.iteritems():
            stemmed_words = []
            for word in value.words:
                stemmed_words.append(self.stemmer.stem(word, 0, len(word)-1))
            value.words = stemmed_words

    def calculate_query_idf(self, query):
        """Return {query word: idf} counting documents that contain it."""
        idf_dict = {}
        total = self.total_docs
        for word in query.split(' '):
            doc_ct = 0
            for data2 in self.data:
                for key2, value2 in data2.docs.iteritems():
                    for word2 in value2.words:
                        if word2 == word:
                            doc_ct += 1
                            break
            if doc_ct == 0:
                # Word absent from the corpus: zero weight.
                idf_dict[word] = 0
            else:
                idf_dict[word] = float(math.log((int(total)/int(doc_ct)), 2))
        return idf_dict

    def calculate_query_tf(self, query):
        """Return {query word: relative frequency within the query}."""
        freq_dict = {}
        tf_dict = {}
        total_words = len(query.split(' '))
        for word in query.split(' '):
            freq_dict[word] = 0
        for word in query.split(' '):
            freq_dict[word] += 1
        for key, value in freq_dict.iteritems():
            # NOTE(review): Python 2 integer division truncates to 0 for
            # any word occurring fewer than total_words times -- confirm.
            freq_dict[key] = value / total_words
        return freq_dict

    def calculate_idfs(self, data):
        # Store per-word idfs on each document of *data*.
        total = self.total_docs
        for key1, value1 in data.docs.iteritems():
            for word1 in value1.words:
                doc_ct = 0
                for data2 in self.data:
                    for key2, value2 in data2.docs.iteritems():
                        for word2 in value2.words:
                            if word2 == word1:
                                doc_ct += 1
                                break
                value1.terms_idf[word1] = math.log10(int(total)/int(doc_ct))

    def do_clear(self):
        # Remove every loaded document from the database.
        for dfile in self.data:
            for key in dfile.docs.iterkeys():
                self.db.remove_id(key)

    def do_print(self, docid):
        found = False
        for dfile in self.data:
            if dfile.docs.has_key(str(docid)):
                found = True
                print dfile.docs[str(docid)].document.text
        if found == False:
            print "Document not found."
def do_read(self, filename): data = JokerData(filename) data.parse_docs() self.stem_words(data) self.data.append(data) count = 0 for dfile in self.data: for doc in dfile.docs: count += 1 self.total_docs = count self.calculate_idfs(data) self.db.persist_docs(data) def do_list(self): index = 0 for dfile in self.data: print index, ":", dfile.filename for key in dfile.docs.iterkeys(): print " ", key def do_show(self, docid): found = False for dfile in self.data: if dfile.docs.has_key(str(docid)): found = True print "\nWords:" print dfile.docs[str(docid)].words print "\nTerm Freqs:" print dfile.docs[str(docid)].terms_freq print "\nIDFs:" print dfile.docs[str(docid)].terms_idf if found == False: print "Document not found." def do_sim(self, docID1, docID2): doc1 = 0 doc2 = 0 for dfile in self.data: for key, value in dfile.docs.iteritems(): if key == docID1: doc1 = value elif key == docID2: doc2 = value if doc1 == 0 or doc2 == 0: print "Error invalid docID" return doc1_wgts = doc1.tf_idf() doc2_wgts = doc2.tf_idf() sim = 0 for key1, value1 in doc1_wgts.iteritems(): for key2, value2 in doc2_wgts.iteritems(): if key2 == key1: sim += value1 sim += value2 print "Sim: ", sim return sim def do_search(self, query): tfs = self.calculate_query_tf(query) idfs = self.calculate_query_idf(query) tf_idfs = {} for key, value in tfs.iteritems(): tf_idfs[key] = value * idfs[key] sims = {} for dfile in self.data: for key, value in dfile.docs.iteritems(): sims[key] = self.query_similarity(tf_idfs, value) sorted_sims = sorted(sims.iteritems(), key=operator.itemgetter(1), reverse=True) for pair in sorted_sims: if pair[1] > 0: print " ", pair[0], ":", pair[1] def do_search_doc(self, docid): sims = {} for dfile in self.data: for key, value in dfile.docs.iteritems(): sims[key] = self.do_sim(docid, key) sorted_sims = sorted(sims.iteritems(), key=operator.itemgetter(1), reverse=True) print "Most relevant documents:" for pair in sorted_sims: if pair[1] > 0: print " ", pair[0], ":", pair[1] def 
query_similarity(self, query_wgt, doc): doc_wgts = doc.tf_idf() sim = 0 for key, value in query_wgt.iteritems(): if doc_wgts.has_key(key): sim += doc_wgts[key] return sim def do_read_list(self, lst): myf = open(self.filename, 'r') for line in myf.readlines(): self.do_read(line) myf.close() def do_quit(self): return None def show_consol(self): values = { 'clear' : self.do_clear, 'print' : self.do_print, 'read' : self.do_read, 'list' : self.do_list, 'read_list' : self.do_read_list, 'show' : self.do_show, 'sim' : self.do_sim, 'search' : self.do_search, 'search_doc': self.do_search_doc, 'quit' : self.do_quit } while True: self.show_menu() try: choice = sys.stdin.readline() except KeyboardInterrupt: break current_opt = choice.replace('\n', '').split(' ') if not values.has_key(current_opt[0].lower()): continue func = values[current_opt[0].lower()] if current_opt[0] == 'quit': return elif current_opt[0] == 'search' and "\"" in choice: cs = choice.split('"') func(cs[1]) elif len(current_opt) == 3: func(current_opt[1], current_opt[2]) elif len(current_opt) == 2: func(current_opt[1]) elif len(current_opt) == 1: func() def show_menu(self): print "Document Collection Options:\n -CLEAR\n -PRINT <docID>\n -SHOW <docID>\n -READ <filename>\n -READ_LIST <list>\n -SIM <docID> <docID>\n -SEARCH_DOC <docID>\n -SEARCH <query>\n -QUIT"
class SentenceSplitter(object):
    """
    Split individual sentences into tokens and stem the tokens.

    The text structure (paragraphs) is preserved: the input is a list of
    sentences grouped by paragraph, and the output is a list of stems shaped
    like [[[],[],[]],[[],[]]], where the second nesting level holds the
    paragraphs and the third the sentences themselves.
    """

    def __init__(self, stopwords, VERBTRANSFORMS, NOUNTRANSFORMS, lexicon_de, language):
        # Language code selects the stemmer/normalizer: 'ru', 'de', else English.
        self.language = language
        self.stopwords = stopwords
        self.VERBTRANSFORMS = VERBTRANSFORMS
        self.NOUNTRANSFORMS = NOUNTRANSFORMS
        # Characters stripped from the beginning and end of every token.
        self.punctuation = "∙!‼¡\"#£€$¥%&'()*+±×÷·,-./:;<=>?¿@[\]^ˆ¨_`—–{|}~≈≠→↓¬’“”«»≫‘…¦›🌼′″¹§¼⅜½¾⅘©✩✒•►●★❤➡➜➚➘➔✔➓➒➑➐➏➎➍➌➋➊❸❷■†✝✌️³²‚„ "
        # For splitting into tokens on whitespace, slashes and listed symbols.
        self.splitchars = re.compile(r'[\s\\\/\(\)\[\]\<\>\;\:\,\‚\—\?\!\|\"«»…#]|\.\.\.+|[ �⌂ ∞½¾►=]|\-\-|\.[\'\"’“”«»‘′″„-]')
        if self.language == 'ru':
            self.stemmer = RussianStemmer()
            # pymorphy2.MorphAnalyzer() instance; its normal_form attribute is used.
            self.lemmatizer_ru = pymorphy2.MorphAnalyzer()
            self.normalizer = NormalizerRU()
        elif self.language == 'de':
            self.stemmer = GermanStemmer()
            self.normalizer = NormalizerDE()
            self.lexicon_de = lexicon_de
        else:
            self.stemmer = PorterStemmer()
            self.normalizer = NormalizerEN()

    def tokenizeString(self, sentence):
        """
        Per-word processing pipeline.  Takes a sentence, splits it into
        tokens with re.split, 'cuts off' punctuation at both ends of each
        word and lowercases it, removing contraction endings along the way
        (del_contractions).  Then replaces irregular verb and noun forms
        with regular ones (and inserts tags for certain markers).
        Returns a generator of tokens.
        """
        # Token generator: split the string on the splitchars regexp,
        # strip surrounding punctuation, lowercase, then normalize.
        if self.language == 'ru':
            tokens = (self.normalizer.normalizeLetters(token.strip(self.punctuation).lower())
                      for token in self.splitchars.split(sentence))
        elif self.language == 'de':
            tokens = (self.normalizer.normalizeLetters(self.normalizer.deleteContrs(token.strip(self.punctuation).lower()))
                      for token in self.splitchars.split(sentence))
        else:
            tokens = (self.normalizer.token_transform(self.normalizer.del_contractions(token.strip(self.punctuation).lower()),
                                                      self.VERBTRANSFORMS, self.NOUNTRANSFORMS)
                      for token in self.splitchars.split(sentence))
        return tokens

    def tokenizeWithCase(self, sentence):
        """
        Same tokenisation routine, but without lowercasing the words.
        Stopword filtering compares the lowercased token; the original case
        is kept in the result.
        """
        # Token generator: split on splitchars, strip surrounding punctuation
        # (case preserved), then drop stopwords.
        if self.language == 'ru':
            tokens = (self.normalizer.normalizeLetters(token.strip(self.punctuation))
                      for token in self.splitchars.split(sentence))
            tokens_with_case = [token for token in tokens if token.lower() not in self.stopwords]
        elif self.language == 'de':
            tokens = (self.normalizer.normalizeLetters(self.normalizer.deleteContrs(token.strip(self.punctuation)))
                      for token in self.splitchars.split(sentence))
            tokens_with_case = [token for token in tokens if token.lower() not in self.stopwords]
        else:
            tokens = (self.normalizer.token_transform(self.normalizer.del_contractions(token.strip(self.punctuation)),
                                                      self.VERBTRANSFORMS, self.NOUNTRANSFORMS)
                      for token in self.splitchars.split(sentence))
            tokens_with_case = [token for token in tokens if token.lower() not in self.stopwords]
        return tokens_with_case

    def stemTokens(self, sentence):
        """
        Build the list of stemmed terms with stopwords removed.  Returns
        tuples pairing the stem of each significant word with the word
        itself (needed later for keyword extraction).
        """
        # Term generator: if a term is not a stopword, stem it
        # (Russian and German terms are lemmatized first).
        if self.language == 'ru':
            stemmed_sentence = ((self.stemmer.stem(self.lemmatizer_ru.parse(term)[0].normal_form), term)
                                for term in self.tokenizeString(sentence) if term not in self.stopwords)
        elif self.language == 'de':
            stemmed_sentence = ((self.stemmer.stem(self.normalizer.lemmatize(term, self.lexicon_de)), term)
                                for term in self.tokenizeString(sentence) if term not in self.stopwords)
        else:
            stemmed_sentence = ((self.stemmer.stem(term, 0, len(term)-1), term)
                                for term in self.tokenizeString(sentence) if term not in self.stopwords)
        # NOTE(review): stemmed_sentence is a generator, which is always
        # truthy, so this branch never returns [] — callers always get the
        # (possibly empty) generator.
        if not stemmed_sentence:
            return []
        else:
            return stemmed_sentence

    def tokenizeListParagraphs(self, list_of_sentences):
        """
        Takes a list of sentences grouped by paragraph.  Puts each (stem,
        word) pair from the stemmed-token list into a new list, preserving
        the paragraph structure.
        """
        tokenized_sentences = []
        for sentences in list_of_sentences:
            terms_list = []
            for s in sentences:
                terms_in_sentence = []
                for term_pair in self.stemTokens(s):
                    # Skip pairs whose stem is empty.
                    if len(term_pair[0]) > 0:
                        terms_in_sentence.append(term_pair)
                terms_list.append(terms_in_sentence)
            tokenized_sentences.append(terms_list)
        return tokenized_sentences

    def tokenizeListSentences(self, list_of_sentences):
        """
        Takes a flat list of sentences (no paragraph grouping) and returns
        a list of per-sentence (stem, word) pair lists.
        """
        tokenized_sentences = []
        for s in list_of_sentences:
            terms_in_sentence = []
            for term_pair in self.stemTokens(s):
                if len(term_pair[0]) > 0:
                    terms_in_sentence.append(term_pair)
            tokenized_sentences.append(terms_in_sentence)
        return tokenized_sentences

    def tokenizeSentencesWithCaseKeeping(self, list_of_sentences):
        """
        Takes a flat list of sentences (no paragraph grouping) and returns
        per-sentence token lists with the original case preserved.
        """
        tokenized_sentences = []
        for s in list_of_sentences:
            terms_in_sentence = []
            for term in self.tokenizeWithCase(s):
                if len(term) > 0:
                    terms_in_sentence.append(term)
            tokenized_sentences.append(terms_in_sentence)
        return tokenized_sentences
class BuildTermSpace(object):
    """
    Build a json object: a dictionary holding the stems of significant
    words and their frequencies, computed from the given corpora.
    """

    def __init__(self, language='en', action='tfidf'):
        # Instantiate LoadExternalLists, build the stop-word list and,
        # for German, load the lexicon.
        # action: 'tfidf' -> document frequencies; 'raw' -> raw term counts.
        self.language = language
        self.action = action
        # Characters stripped from the beginning and end of every token.
        self.punctuation = "∙!‼¡\"#£€$¥%&'()*+±×÷·,-./:;<=>?¿@[\]^ˆ¨_`—–{|}~≈≠→↓¬’“”«»≫‘…¦›🌼′″¹§¼⅜½¾⅘©✒•►●★❤➡➜➚➘➔✔➓➒➑➐➏➎➍➌➋➊❸❷■†✝✌️³²‚„ "
        loadRes = LoadExternalLists()
        if self.language == 'de':
            self.stopwords = loadRes.loadStopWordsDE()
            # Stemmer object.
            self.stemmer = GermanStemmer()
            # German dictionary.
            print '\n', "Loading German Dictionary... OK", '\n'
            self.lexicon_de = loadRes.loadLexiconDe()
            self.normalizer = NormalizerDE()
        elif self.language == 'ru':
            self.stopwords = loadRes.loadStopWordsRU()
            self.stemmer = RussianStemmer()
            # pymorphy2.MorphAnalyzer() instance; its normal_form attribute is used.
            self.lemmatizer_ru = pymorphy2.MorphAnalyzer()
            self.normalizer = NormalizerRU()
        else:
            self.stopwords = loadRes.loadStopWordsEN()
            self.stemmer = PorterStemmer()
            self.normalizer = NormalizerEN()
            # Irregular verb forms list.
            # NOTE(review): placed inside the English branch because only the
            # English pipeline (processString's else branch) uses them —
            # confirm against the original file's indentation.
            self.irreg_verbs = loadRes.loadVerbForms()
            # Irregular noun forms list.
            self.irreg_nouns = loadRes.loadNounforms()

    def processString(self, line):
        """
        Per-word processing pipeline.  Takes a line of text, splits it into
        tokens with re.split, 'cuts off' punctuation at both ends of each
        word, lowercases it, and removes contraction endings
        (del_contractions).  Then builds the generator rslt_list of stemmed
        terms with stopwords and numeric sequences removed.  Returns
        rslt_list, which contains only the stems of significant words.
        """
        # For splitting into tokens on whitespace and slashes.
        splitchars = re.compile(r'[\s\\\/\(\)\[\]\<\>\;\:\,\‚\—\?\!\|\"«»…#]|\.\.\.+|[ �⌂ ∞½¾►=]|\-\-|\.[\'\"’“”«»‘′″„-]')
        # [\.\:][\'\"’“”«»‘′″]
        # For ignoring tokens that contain digits.
        esc_num = re.compile(r'[0-9]+')
        # For ignoring URLs (disabled).
        #url_esc = re.compile(r'([a-z]{3,6}:\/\/)?([a-zA-Z0-9\-@?]+[\.|\:])+[a-z]{2,13}[\.\?\=\&\%\,\#\+\(\)\/\w\-]*')
        if self.language == 'de':
            tokens = (self.normalizer.normalizeUmlaut(self.normalizer.deleteContrs(token.strip(self.punctuation).lower()))
                      for token in splitchars.split(line))
            # esc_num.search(term) drops tokens containing digits.
            rslt_list = (self.stemmer.stem(self.normalizer.lemmatize(term, self.lexicon_de))
                         for term in tokens
                         if term not in self.stopwords and not esc_num.search(term) and len(term)>0)
        elif self.language == 'ru':
            tokens = (self.normalizer.normalizeE(token.strip(self.punctuation).lower())
                      for token in splitchars.split(line))
            # esc_num.search(term) drops tokens containing digits.
            rslt_list = (self.stemmer.stem(self.lemmatizer_ru.parse(term)[0].normal_form)
                         for term in tokens
                         if term not in self.stopwords and not esc_num.search(term) and len(term)>0)
        else:
            # Token generator: split the line on splitchars, strip
            # surrounding punctuation, lowercase, transform irregular verb
            # forms into regular ones, and delete '-contractions.
            tokens = (self.normalizer.token_transform(self.normalizer.del_contractions(token.strip(self.punctuation).lower()),
                                                      self.irreg_verbs, self.irreg_nouns)
                      for token in splitchars.split(line))
            # Term generator: if a term is neither a stopword nor contains
            # digits, stem it.
            rslt_list = (self.stemmer.stem(term, 0, len(term)-1)
                         for term in tokens
                         if term not in self.stopwords and not esc_num.search(term) and len(term)>0)
        # NOTE(review): rslt_list is a generator, which is always truthy, so
        # this branch never returns [] — callers always get the generator.
        if not rslt_list:
            return []
        else:
            return rslt_list

    def processFile(self, filename):
        """
        Read the file as utf-16 and call processString on each line.
        action == 'tfidf': collect terms into a set (deduplicated);
        action == 'raw'  : collect every occurrence into a list.
        Decode/IO errors skip the rest of the file silently (best-effort).
        """
        terms_set = set()
        terms_list = []
        if self.action == 'tfidf':
            try:
                with codecs.open(filename, 'r', 'utf-16') as infile:
                    for line in infile:
                        if len(line) > 1:
                            for term in self.processString(line):
                                terms_set.add(term)
            except (UnicodeDecodeError, UnicodeError, IOError):
                pass
            return terms_set
        if self.action == 'raw':
            try:
                with codecs.open(filename, 'r', 'utf-16') as infile:
                    for line in infile:
                        if len(line) > 1:
                            for term in self.processString(line):
                                terms_list.append(term)
            except (UnicodeDecodeError, UnicodeError, IOError):
                pass
            return terms_list

    def crawl(self, dirname):
        """
        Walk the directory tree rooted at dirname.  For every .txt file,
        run processFile and fold the result into terms_dict, which counts
        for each term the number of documents it occurred in (the document-
        frequency part of the tf-idf formula).  The dictionary is saved as
        json ('tfidf') or as a tab-separated frequency list ('raw').
        """
        docs_num = 0
        terms_dict = defaultdict(int)
        for root, dirs, files in os.walk(dirname):
            print root, "processing..."
            for filename in files:
                if filename.endswith('.txt') or filename.endswith('.TXT'):
                    print filename
                    terms_set = self.processFile(join(root,filename))
                    for term in terms_set:
                        terms_dict[term] += 1
                    docs_num+=1
        if self.action == 'raw':
            with codecs.open(r'.\termSpace\\'+self.language.upper()+'frequency_list_stem.txt', 'w', 'utf-16') as outfile:
                for key, value in sorted(terms_dict.iteritems(), key=lambda x:x[1], reverse=True):
                    outfile.write(key+'\t'+str(value))
                    outfile.write('\n')
        if self.action == 'tfidf':
            with open(r".\termSpace\\" + self.language.upper() + "CorpusDict_" + str(docs_num) + ".json", 'w') as outfile:
                json.dump(terms_dict, outfile)
# MIT Licensed # Copyright 2014 REM <*****@*****.**>. from pymongo import MongoClient from pymongo import DESCENDING import utility from porter import PorterStemmer p = PorterStemmer() client = MongoClient('localhost', 27017) db = client.uberly clt = db.uber_vocab_1 #db.uber_dictionary.find().limit(50).sort({value:-1}).pretty() for entry in clt.find().sort([('value', DESCENDING)]): entry['stem'] = p.stem(entry['_id'], 0,len(entry['_id'])-1) clt.save(entry) print(entry['_id']), entry['stem']
class Preprocessor:
    """Text preprocessor: strips special characters, splits camelCase and
    Porter-stems the resulting lowercase words."""

    def __init__(self):
        """Initialize the object with pre-compiled regexes and a stemmer."""
        # Matches every non-alphanumeric character.
        self.spec_chars_regex = re.compile('[^0-9a-zA-Z]')
        # Two-pass camelCase boundary detection (e.g. "HTMLParser", "aB2c").
        self.camel_case_regex_1 = re.compile('(.)([A-Z][a-z]+)')
        self.camel_case_regex_2 = re.compile('([a-z0-9])([A-Z])')
        self.stemmer = PorterStemmer()  # from Gupta's Porter Stemmer

    def removeSpecialChars(self, string):
        """Return a copy of string with all non-alphanumeric characters
        replaced by whitespace."""
        return self.spec_chars_regex.sub(' ', string)

    def splitCamelCase(self, string):
        """Return a copy of string with each camelCase word split into
        separate words with whitespace in between."""
        newString = self.camel_case_regex_1.sub(r'\1 \2', string)
        return self.camel_case_regex_2.sub(r'\1 \2', newString)

    def porterStem(self, words):
        """Return a list with each word in words stemmed by the Porter
        Stemmer."""
        # (The original allocated an unused placeholder list here; removed.)
        return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]

    def preprocess(self, string):
        """Replace special characters with whitespace, split camelCase
        words, lowercase and split into words, and return the list of their
        Porter stems."""
        newString = self.removeSpecialChars(string)
        newString = self.splitCamelCase(newString)
        words = newString.lower().split()
        return self.porterStem(words)

    def prepDoc(self, doc, combine=False):
        """Open the document at path doc, preprocess each line, and return a
        list of per-line stem lists — or, when combine is true, one flat
        list (via combineVectors).  Empty lines are skipped."""
        texts = []
        with open(doc) as file:
            for line in file:
                prepLine = self.preprocess(line)
                if prepLine:
                    texts.append(prepLine)
        if combine:
            return self.combineVectors(texts)
        return texts

    def combineVectors(self, texts):
        """Flatten a list of vectors into one long vector and return it."""
        newTexts = []
        for text in texts:
            newTexts += text
        return newTexts
elif (query < lData[0]): print (query+ " < "+lData[0]) return search(query,index+1,middle,m1) elif (query > lData[0]): print (query+ " > "+lData[0]+" so go to index : "+str(index+middle)) return search(query,index+middle,middle,m1) else: return [] print(" size : "+str(int(m1.size()/35))) p = PorterStemmer() query= sys.argv[1] query = p.stem(query,0,len(query)-1) lBytes = search(query,0,int(m1.size()/35),m1) print("lBytes : ",lBytes) if(lBytes != None): termFile = "./indexes/terms.txt" with open(termFile, "r+b") as f: # memory-map the file, size 0 means whole file map = mmap.mmap(f.fileno(), 0) print("Term stuff : ",map[int(lBytes[0]):int(lBytes[1])]) else: print("Not found") ''' termsListFile = "./indexes/termsList.txt" fl = open(termsListFile,"r")
# Quick example of the stemming
from porter import PorterStemmer
from lancaster import LancasterStemmer

# Build both stemmers (only the Porter stemmer is exercised below).
porter = PorterStemmer()
lancaster = LancasterStemmer()

# A small sample vocabulary to push through the stemmer.
words = [
    "dogs",
    "cats",
    "cafes",
    "shops",
    "bookshops",
    "bars",
    "cafe",
    "columbia",
    "coffee",
    "coffees",
    "outdoors",
]

# Print each word, its Porter stem, and a separator line.
for current in words:
    output_lines = (
        "Word: {0}".format(current),
        "Stem: {0}".format(porter.stem(current)),
        "======",
    )
    for line in output_lines:
        print(line)