def tokenize_on_porter(text):
    """Tokenize text line by line, stemming each word with the Porter stemmer."""
    word_list = []
    p = PorterStemmer()
    outfile = open('out3', 'w')
    for line in text.splitlines():
        output = ''
        word = ''
        if line != '':
            for c in line:
                if c.isalpha():
                    word += c.lower()
                else:
                    if word:
                        word_stem = p.stem(word, 0, len(word) - 1)
                        output += word_stem
                        word_list.append(word_stem)
                        word = ''
                    output += c.lower()
            # flush a trailing word when the line ends with a letter
            if word:
                word_stem = p.stem(word, 0, len(word) - 1)
                output += word_stem
                word_list.append(word_stem)
        print(output, end='\n', file=outfile)
    outfile.close()
    return word_list
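# A minimal usage sketch for tokenize_on_porter; it assumes the reference
# PorterStemmer (with the stem(word, start, end) signature) is importable,
# e.g. via `from porter import PorterStemmer` as other snippets here do:
# stems = tokenize_on_porter("Running quickly.\nJumped higher!")
# stems -> ['run', 'quickli', 'jump', 'higher'], with the stemmed lines
# also written to the hardcoded 'out3' file.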
def __init__(self):
    '''
    if phrase_dict_json != None: extract the phrase features
    if subtype_flag = True, extract the features by sub parse_type
    if bioe_flag = True, use the BIOE tags
    '''
    self.features = ['pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color']
    if 'pos' in self.features:
        self.pos_tagger = SennaTagger(global_params.sennadir)
    if 'chunk' in self.features:
        self.chunk_tagger = SennaChunkTagger(global_params.sennadir)
    self.sentences = []
    self.porter = PorterStemmer()
    self.token_dict = None
    self.bins = 50
def build(self, docpath, outfile):
    p = PorterStemmer()
    sw = stopwords.StopWords(self.stopword_file)
    ndx = defaultdict(list)
    for filename in os.listdir(docpath):
        if not filename.endswith(".txt"):
            continue
        doc_id = hash(filename.replace(".txt", ""))
        with open(os.path.join(docpath, filename)) as f:
            f_content = kwutils.normalize(f.read().lower())
        words = kwutils.tokenize(f_content)
        w_stemmed = kwutils.stem(words, p)
        w_stopped = kwutils.filter_stopwords(w_stemmed, sw)
        for word in w_stopped:
            if len(word) > 0:
                if doc_id not in ndx[word]:
                    ndx[word].append(doc_id)
    with open(outfile, 'w') as f:
        f.write(json.dumps(ndx))
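# A minimal usage sketch, assuming this build() lives on an indexer class that
# defines stopword_file, with the kwutils/stopwords helpers importable:
# indexer = Indexer()                     # hypothetical wrapper class
# indexer.build("corpus/", "index.json")  # writes {stem: [doc_id, ...]} as JSON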
def process_query(self, query):
    all_doc_count = len(self.invert.documents.keys())
    query_array = [x.lower() for x in query.split(' ')]
    query_weights = {}
    stopwords = []
    if self.stopword_toggle:
        stopwords = fetch_stopwords()
    while query_array:
        word = query_array.pop(0)
        frequency = 1
        # strip punctuation sitting at the start or end of the word
        for a in [',', '.', '{', '}', '(', ')', ';', ':', '"', '\'']:
            if a in word:
                if word.index(a) == 0 or word.index(a) == len(word) - 1:
                    word = word.replace(a, '')
        # count duplicate occurrences of the word in the rest of the query
        while word in query_array:
            query_array.pop(query_array.index(word))
            frequency += 1
        if self.stemming_toggle:
            p = PorterStemmer()
            word = p.stem(word, 0, len(word) - 1)
        if word in stopwords:
            continue
        term_weight = 0
        if word in self.invert.termsDictionary.keys():
            document_frequency = self.invert.termsDictionary[word]
            idf = math.log(all_doc_count / document_frequency)
            term_frequency = 1 + math.log(frequency)
            term_weight = idf * term_frequency
        query_weights[word] = term_weight
    return query_weights
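# Worked example of the tf-idf weighting above: with 100 indexed documents, a
# term that appears in 10 of them, and a query containing the term twice:
#   idf    = ln(100 / 10) ≈ 2.303
#   tf     = 1 + ln(2)    ≈ 1.693
#   weight = idf * tf     ≈ 3.899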
poslineedited = []
neglinesedited = []
# there are 6397 positives and negatives in total.
poslinesTrain = poslines[:3201]
neglinesTrain = neglines[:3196]
priorknowledgeneg = 3196 / 6397
priorknowledgepo = 3201 / 6397
stemmer = PorterStemmer()
model = open('F:/ifa/NaiveBayes/model_file.csv', 'w', encoding="utf8")
trainset = [(x, 1) for x in poslinesTrain] + [(x, -1) for x in neglinesTrain]
poswords = {}  # this dictionary stores counts for every word in positives
negwords = {}  # and negatives
for line, label in trainset:
    words = getwords(line)
    for word in words:
        word = word.lower()  # str.lower() returns a new string; rebind it
        # increment the counts for this word based on the label
        # the .get(x, 0) method returns the current count for word
        # x, or 0 if the word is not yet in the dictionary
import re

from porter import PorterStemmer

p = PorterStemmer()

def lcase(text):
    return text.lower()

def prefixes(text):
    return [text[:3], text[:4], text[:5]]

def suffixes(text):
    return [text[-3:], text[-4:], text[-5:]]

def stem(text):
    if text.isalpha():
        return p.stem(text.lower(), 0, len(text) - 1)
    return text

def is_pair_of_digits(text):
    if re.match("^[0-9]{2}$", text):
        return True
    return False

def is_four_digits(text):
    if re.match("^[0-9]{4}$", text):
        return True
    return False
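# Quick usage sketch of the feature helpers above:
# prefixes("stemming")    -> ['ste', 'stem', 'stemm']
# suffixes("stemming")    -> ['ing', 'ming', 'mming']
# stem("stemming")        -> 'stem'
# is_pair_of_digits("42") -> True
# is_four_digits("2024")  -> True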
def stem(word):
    p = PorterStemmer()
    return p.stem(word, 0, len(word) - 1)
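# Note: this helper builds a fresh PorterStemmer on every call, which is
# wasteful inside a loop; hoisting the instance out (as other snippets here
# do) is cheaper. Expected behavior with the reference stemmer:
# stem("caresses") -> 'caress'
# stem("ponies")   -> 'poni'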
def __init__(self):
    self.p = PorterStemmer()
    self.sw = stopwords.StopWords(self.stopword_file)
    self.re_tag = RE_TAG
    self.index = None
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t')
print("dataset imported")

import re
import nltk

nltk.download('stopwords')  # to remove stopwords
from nltk.corpus import stopwords

# for stemming purposes
# from nltk.stem.porter import PorterStemmer
from porter import PorterStemmer

p = PorterStemmer()
# the reference stemmer takes (word, start, end) and expects lowercase input
p.stem("alcoholic", 0, len("alcoholic") - 1)

# Initialize an empty list to collect the cleaned text
corpus = []
for i in range(0, 1000):
    # column: "Review", row i
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    # convert all characters to lower case
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    # loop for stemming each word
    # in the word list for row i
def train():
    poslines = []
    neglines = []
    stopwords = open(r'stopwords.txt', 'r').read().splitlines()
    dataset = open('training_set.csv', 'r', encoding="utf8")
    dataset.readline()  # skip the header row
    for data in dataset:
        data = data.lower()  # str.lower() returns a new string; rebind it
        datalines = data.split(",")[1].strip('"').split(' ')  # tokenize the sentence
        DataClass = data.split(",")[0]
        if int(DataClass) == 0:
            poslines.append(datalines)
        elif int(DataClass) == 1:
            neglines.append(datalines)
        else:
            continue
    print("The total positive lines are:", len(poslines))
    print("The total negative lines are:", len(neglines))
    poslineedited = []
    neglinesedited = []
    # there are 6397 positives and negatives in total.
    poslinesTrain = poslines[:3201]
    neglinesTrain = neglines[:3196]
    priorknowledgeneg = 3196 / 6397
    priorknowledgepo = 3201 / 6397
    stemmer = PorterStemmer()
    model = open('model_file.csv', 'w', encoding="utf8")
    trainset = [(x, 1) for x in poslinesTrain] + [(x, -1) for x in neglinesTrain]
    poswords = {}  # this dictionary stores counts for every word in positives
    negwords = {}  # and negatives
    for line, label in trainset:
        words = getwords(line)
        for word in words:
            word = word.lower()
            # increment the counts for this word based on the label;
            # .get(word, 0) returns the current count, or 0 if unseen
            if label == 1:
                poswords[word] = poswords.get(word, 0) + 1
            if label == -1:
                negwords[word] = negwords.get(word, 0) + 1
    positivewordlist = open(r'positive-words.txt', 'r').read().splitlines()
    negativewordlist = open(r'negative-words.txt', 'r').read().splitlines()
    # evaluate the test set
    testset = open('test_set.csv', 'r', encoding="utf8")
    testset.readline()  # skip the header row
    # make predictions
    output = open("prediction_file.csv", 'w')
    for line in testset:
        linesplit = line.split()
        testwords = getwords(linesplit)
        totpos, totneg = 0.0, 0.0
        for word in testwords:
            word = word.lower()
            a = poswords.get(word, 0.0) + 1.0  # add-one smoothing
            b = negwords.get(word, 0.0) + 1.0
            totpos += a / (a + b)
            totneg += b / (a + b)
            model.write("Word: " + str(word) + ",")
            model.write("Relative positive usage: " + str(totpos) + ",")
            model.write("Relative negative usage: " + str(totneg) + '\n')
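# Worked example of the smoothed per-word score above: if "great" appeared in
# 30 positive and 5 negative training lines, add-one smoothing gives
#   a = 30 + 1 = 31, b = 5 + 1 = 6
#   positive contribution: 31 / 37 ≈ 0.838
#   negative contribution:  6 / 37 ≈ 0.162
# and these per-word ratios are summed over the words of each test line.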
def test():
    stem = input("Was the stemmer used in the inversion? (Y/N)")
    return_times = []
    g = open("postings.txt", "r")
    content = g.read().replace('\n', ' ')
    post_list = json.loads("[" + content[:-2] + "]")
    h = open("cacm.all", "r")
    lines = h.readlines()
    if g.mode == 'r' and h.mode == 'r':
        word = ""
        while word != "zzend":
            word = input("Enter a term to search for: ").lower()
            if stem == "Y":
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)
            found_word = False
            start = timer()
            for elem in post_list:
                if word == elem[0]:
                    found_word = True
                    print("\nThis term is found in " + str(len(elem[1])) + " documents.")
                    print("=============================================================================")
                    break
            if found_word:
                print("This search term is found in the following documents:\n")
                # output all docs that contain the term: DocID, title, TF, all
                # the positions, and the first occurrence with 10 words of context
                docdata = []
                for entry in post_list:
                    if entry[0] == word:
                        docdata += entry[1]
                        break
                # docdata now has doc ID, TF, and positions for each document
                # the term appears in; now search cacm.all for the word data
                count = 0
                get_title = False
                abstract_bool = False
                abstract_text = ""
                title = ""
                output = ""
                found = False
                for line in lines:
                    if count == len(docdata):
                        break
                    if line.startswith(".I " + str(docdata[count][0])):
                        found = True
                    if line == ".B\n" and found:
                        get_title = False
                        abstract_bool = False
                        found = False
                        # build the output string here, since the per-document
                        # state is about to be reset
                        output += "Document " + str(docdata[count][0]) + " - " + title + "Term frequency: " + \
                            str(docdata[count][1]) + "\nList of positions: " + str(docdata[count][2]) + \
                            "\nFirst occurrence in document: " + \
                            getcontext(title + abstract_text, docdata[count][2][0]) + "\n" + "------------" + "\n"
                        title = ""
                        abstract_text = ""
                        count += 1
                    if abstract_bool:
                        abstract_text += line
                    if line == ".W\n" and found:
                        get_title = False
                        abstract_bool = True
                    if get_title:
                        title += line
                    if line == ".T\n" and found:
                        get_title = True
            end = timer()
            elapsed_time = end - start
            if found_word:
                return_times += [elapsed_time]
                print(output)
                print("Search time: " + str(elapsed_time) + " seconds\n")  # report the search time
            elif word != "zzend":
                print("Term not found in any documents")
        shutdown(return_times)
        g.close()
        h.close()
    else:
        print("Error opening file. Try again.")
def __init__(self):
    self.spec_chars_regex = re.compile('[^0-9a-zA-Z]')
    self.camel_case_regex_1 = re.compile('(.)([A-Z][a-z]+)')
    self.camel_case_regex_2 = re.compile('([a-z0-9])([A-Z])')
    self.stemmer = PorterStemmer()  # from Gupta's Porter stemmer
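# A minimal sketch of how these two regexes are typically chained to split
# camelCase identifiers before stemming (the r'\1 \2' replacement is an
# assumption, not shown in the snippet):
import re

s = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', 'getHTTPResponseCode')
s = re.sub('([a-z0-9])([A-Z])', r'\1 \2', s)
print(s)  # -> 'get HTTP Response Code'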
def __init__(self):
    self._stemmer = PorterStemmer()
def lookup(user_input, CLI, K):
    use_stem = False
    stop_words = False
    g = open("postings.txt", "r")
    f = open("cacm.all", "r")
    content = g.read().replace('\n', ' ')
    if content[0] == "1":
        use_stem = True
    if content[1] == "1":
        stop_words = True
    post_list = json.loads("[" + content[2:-2] + "]")
    lines = f.readlines()
    f.close()
    extracted_postings = []
    docs = []
    final_list = []
    if g.mode == 'r':
        # normalize the query
        og_query = user_input.lower()
        og_query = re.sub('[\-]+', ' ', og_query)
        og_query = re.sub('[^A-Za-z0-9$ ]+', '', og_query)
        newquery = og_query.split()
        if stop_words:
            temp = []
            stop_words = open("stopwords.txt", "r").read().split('\n')
            for i in range(len(stop_words)):
                stop_words[i] = stop_words[i].lower()
            for word in newquery:
                if word not in stop_words:
                    temp.append(word)
            newquery = temp
        if use_stem:
            stemmed_query = ""
            for word in newquery:
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)
                stemmed_query += word + " "
            newquery = stemmed_query.split()
        newquery.sort()
        term_list = get_term_lists(newquery, post_list)
        # remove duplicates if they exist
        term_list = list(dict.fromkeys(term_list))
        for entry in term_list:
            extracted_postings.append(post_list[entry])
        # get docs out of extracted postings
        for posting in extracted_postings:
            for entry in posting[1]:
                docs.append(entry[0])
        docs = list(dict.fromkeys(docs))
        docs.sort()
        document_vectors = get_doc_vector(docs, lines, use_stem, stop_words)
        # print("Relevant document vectors created. Now calculating cosine similarity")
        # now fill those vectors with tf values, and then weights
        cosine_list = fill_vectors(document_vectors, og_query, docs)
        temp_list = []
        for i in range(len(docs)):
            temp_list.append([docs[i], cosine_list[i]])
        temp_list.sort(key=lambda x: x[1])
        temp_list.reverse()
        if CLI:
            print("Query was: " + user_input + "\n")
            display(temp_list, get_doc_info(docs, lines))
        for elem in temp_list:
            final_list.append(elem[0])
        if K is None:
            return final_list
        else:
            return final_list[:K]
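# fill_vectors and get_doc_vector are defined elsewhere in this project; a
# minimal self-contained sketch of the cosine similarity they presumably feed,
# with query and document vectors as term -> weight dicts:
import math

def cosine_similarity(q, d):
    # dot product over shared terms, normalized by both vector magnitudes
    dot = sum(w * d.get(t, 0.0) for t, w in q.items())
    norm_q = math.sqrt(sum(w * w for w in q.values()))
    norm_d = math.sqrt(sum(w * w for w in d.values()))
    if norm_q == 0.0 or norm_d == 0.0:
        return 0.0
    return dot / (norm_q * norm_d)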
def __init__(self, PATH_TO_STOP_WORDS):
    print("[Tokenizer] Instantiated!")
    self.PATH_TO_STOP_WORDS = PATH_TO_STOP_WORDS
    self.STOP_WORDS = self.load_stopwords()
    self.PorterStemmer = PorterStemmer()
def processEmail(email_contents):
    # PROCESSEMAIL preprocesses the body of an email and
    # returns a list of word_indices
    #   word_indices = PROCESSEMAIL(email_contents) preprocesses
    #   the body of an email and returns a list of indices of the
    #   words contained in the email.

    # Load Vocabulary
    vocab = getVocabDict()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers
    # hdrstart = email_contents.find('\n\n')
    # email_contents = email_contents[hdrstart+2:]

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with >
    # and does not have any < or > in the tag, and replaces it with a space
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLs
    # Look for strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n')

    # Process file
    l = 0
    porterStemmer = PorterStemmer()

    # Tokenize and also get rid of any punctuation
    sep = '[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\},\'\"\>\_\<\;\%\n\r]+'
    for s in re.split(sep, email_contents):
        # Remove any non-alphanumeric characters
        s = re.sub('[^a-zA-Z0-9]', '', s)

        # Stem the word
        s = porterStemmer.stem(s.strip())

        # Skip the word if it is too short
        if len(s) < 1:
            continue

        # Look up the word in the dictionary and add to word_indices if found
        # ====================== YOUR CODE HERE ======================
        # Instructions: Fill in this function to add the index of s to
        #               word_indices if it is in the vocabulary. At this point
        #               of the code, you have a stemmed word from the email in
        #               the variable s. You should look up s in the
        #               vocabulary dictionary (vocab). If a match exists, you
        #               should add the index of the word to the word_indices
        #               vector. Concretely, if s = 'action', then you should
        #               add to word_indices the value under the key 'action'
        #               in vocab. For example, if vocab['action'] = 18, then,
        #               you should add 18 to the word_indices vector
        #               (e.g., word_indices.append(18) ).
        # =============================================================

        # Print to screen, ensuring that the output lines are not too long
        if l + len(s) + 1 > 78:
            print()
            l = 0
        print(s, end=' ')
        l += len(s) + 1

    # Print footer
    print('\n=========================')
    return array(word_indices)
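# A minimal sketch of the vocabulary lookup the exercise leaves as
# YOUR CODE HERE above, assuming vocab maps stemmed words to integer indices:
# if s in vocab:
#     word_indices.append(vocab[s])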