def __init__(self, fileA, fileB):
    self.__allWords = set()
    self.__wordsA = dict()
    self.__wordsB = dict()
    with open(fileA, 'r') as document:
        for line in document:
            words = line.strip().split()
            for word in words:
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)
                if word in self.__wordsA.keys():
                    self.__wordsA[word] += 1
                else:
                    self.__wordsA[word] = 1
    with open(fileB, 'r') as document:
        for line in document:
            words = line.strip().split()
            for word in words:
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)
                if word in self.__wordsB.keys():
                    self.__wordsB[word] += 1
                else:
                    self.__wordsB[word] = 1
    self.__allWords = set(self.__wordsA.keys()) | set(self.__wordsB.keys())
    self.__table = {t[1]: t[0] for t in enumerate(self.__allWords)}
def __init__(self, name, task_queue, result_queue):
    self.name = name
    self.r_stemmer = PorterStemmer()
    self.queue = task_queue
    self.queue2 = result_queue
    # load the stopword list
    pre_worker.load_stopwords()
def stemizeQuery(query):
    # STEMIZER FOR QUERY
    porter = PorterStemmer()
    newList = []
    for q in query:
        newList.append(porter.stem(q, 0, len(q) - 1))
    return newList
def __init__(self, path):
    self.stemmer = PorterStemmer()
    self.path = path
    self.cur_idx = 0
    self.batch = 2
    self.sample = 0.001
    self.vocab_size = 10000
    self.total_count = 0
    self.word_count = Counter()
    self.word2idx = defaultdict(int)
    self.idx2word = {}
    self.word_sample = {}
    self.batch_size = 128
    self.embedding_size = 128  # Dimension of the embedding vector.
    self.skip_window = 3  # How many words to consider left and right.
    self.raw_sample_probs = [0.5, 0.3, 0.2]
    self.sample_probs = []
    cumulative = 0  # running total of the raw sampling probabilities
    for prob in self.raw_sample_probs:
        cumulative += prob
        self.sample_probs.append(cumulative)
    self.num_skips = 2  # How many times to reuse an input to generate a label.
    self.valid_size = 16  # Random set of words to evaluate similarity on.
    self.valid_window = 100  # Only pick dev samples in the head of the distribution.
    # self.valid_examples = np.random.choice(valid_window, valid_size, replace=False)
    self.num_negative_sampled = 64  # Number of negative examples to sample.
    self.batch_index = 0
    self.get_stat()
def stemize(dictList):
    # STEMIZER FOR DICTIONARY
    porter = PorterStemmer()
    for d in dictList:
        # iterate over a copy of the keys, since entries are re-keyed in place
        for token in list(d.keys()):
            d[porter.stem(token, 0, len(token) - 1)] = d.pop(token)
    return dictList
def tokenize(sText, pairing=False):
    '''Given a string of text sText, returns a list of the individual stemmed tokens
    that occur in that string (in order). This is my quick and dirty Tokenizer.
    Satisfaction Not Guaranteed'''
    import string
    from stemmer import PorterStemmer
    sText = sText.lower()
    sText = re.sub("’", "'", sText)
    sText = re.sub("&.{0,6};", " ", sText)
    sText = re.sub("[\x80-\xff]", "", sText)
    sText = sText.split(None)
    for p in string.punctuation.replace("'", ""):
        try:
            sText = mapAndFold(lambda x: x.split(p), sText)
        except TypeError:  # empty string
            return []
    sText = mapAndFold(lambda x: x.split(), sText)
    sText = map(lambda x: x.strip("\'"), sText)
    sText = map(lambda x: x.strip("\""), sText)
    sText = map(lambda x: x.strip("_"), sText)
    sText = filter(lambda x: not re.match("\d+", x), sText)
    sText = filter(lambda x: not x == "", sText)
    sText = filter(lambda x: not x[0] == "@", sText)
    stemmer = PorterStemmer()
    if pairing:
        # return original with token val in tuple
        return [(w, stemmer.stem(w, 0, len(w) - 1)) for w in sText]
    return [stemmer.stem(w, 0, len(w) - 1) for w in sText]
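# A quick usage sketch of the tokenizer above. The input string is hypothetical;
# the snippet assumes the surrounding module already imports `re` and defines
# `mapAndFold`, and the exact stems depend on the bundled PorterStemmer.
sample = "Running faster than the other runners!"
print(tokenize(sample))                # e.g. ['run', 'faster', 'than', 'the', 'other', 'runner']
print(tokenize(sample, pairing=True))  # [(original_token, stemmed_token), ...]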
def GetDataset():
    emails = None
    x_vals = []
    y_vals = []
    stemmer = PorterStemmer()
    with open("pickled_reduced_chains.txt", "rb") as fp1:  # Unpickling
        emails = pickle.load(fp1)
    i = 0
    text_data = []
    for i in range(0, len(emails)):
        print "Evaluation Email %d" % (i)
        email, next_email, time_diff = emails[i]
        print emails[i]
        # Create feature array
        features = []
        # if np.round(time_diff / 60) > 72: continue
        # Feature 1: Number of to
        features.append(len(email['to']))
        # Feature 2: Num words
        words = email['body'].split()
        features.append(len(words))
        # Feature 3: Number of CC
        features.append(email['cc_count'])
        # Feature 4: is reply
        if email['is_re']:
            features.append(1)
        else:
            features.append(0)
        # Feature 5: Time of Day (minutes)
        date = email['date']['local_date']
        hour = date.hour
        features.append(hour)
        # Feature 6: Length of Subject Line
        subject_words = email['subject'].split()
        features.append(len(subject_words))
        # Feature 7: Day of Week
        features.append(date.weekday())
        # Feature 8: Question marks in Body
        features.append(email['body'].count('?'))
        # Feature 9: Question marks in Subject
        features.append(email['subject'].count('?'))
        x_vals.append(features)
        # Append y_value for training point
        y_vals.append(int(np.round(time_diff / 60)))
    a = np.array(x_vals)
    b = np.array(y_vals)
    return a, b
def stemWords(listTokens):
    s = PorterStemmer()
    stemmedTerms = []
    for x in listTokens:
        stemmedTerms.append(s.stem(x, 0, len(x) - 1))
    return stemmedTerms
def stemWords(inList):
    outList = []
    ps = PorterStemmer()
    for token in inList:
        stemmed_token = ps.stem(token, 0, len(token) - 1)
        outList.append(stemmed_token)
    return outList
def stemWords(tokens):
    """Function that stems the words."""
    # use porter stemmer
    # https://tartarus.org/martin/PorterStemmer/python.txt
    p = PorterStemmer()
    for index, word in enumerate(tokens):
        tokens[index] = p.stem(word, 0, len(word) - 1)
    return tokens
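# The three-argument call stem(word, 0, len(word) - 1) used throughout these
# snippets follows the reference implementation linked above. A minimal
# standalone sketch, assuming that stemmer.py module is on the import path:
from stemmer import PorterStemmer

p = PorterStemmer()
for w in ["caresses", "ponies", "relational"]:
    # stem(word, start, end) stems word[start:end + 1], hence the len(w) - 1
    print(w, "->", p.stem(w, 0, len(w) - 1))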
def stem(word):
    # word needs to be all lowercase before being passed to stem
    word = word.lower()
    # fancy stuff to remove .,?!"
    mymatch = re.compile('(\,|\.|\!|\?|\")')
    word = mymatch.sub(r'', word)
    p = PorterStemmer()
    word = p.stem(word, 0, len(word) - 1)
    return word
def main():
    array = []
    array2 = []
    p = PorterStemmer()
    with open(sys.argv[1]) as f:
        for line in f:
            if len(line) > 1:
                array.append(line[0:len(line) - 1])
    word_dictionary, tag_dictionary, count = read_input(array, p)
    with open(sys.argv[2]) as f:
        for line in f:
            if len(line) > 1:
                array2.append(line[0:len(line) - 1])
    read_test_data(array2, word_dictionary, tag_dictionary, count)
def GetWordDictionary(emails):
    word_dict = {}
    count = 0
    stemmer = PorterStemmer()
    for email_case in emails:
        email = email_case[0]
        body = SplitText(email['body'])
        for word in body:
            modified_word = word
            if len(modified_word) > 1:
                modified_word = stemmer.stem(word, 0, len(word) - 1)
            if modified_word not in word_dict:
                word_dict[modified_word] = count
                count += 1
    return word_dict
def main():
    # Reading the document from the file
    fileName = "cran.all.1400"
    documents = readFromFile(fileName, "r")
    # Reading stop words from the file
    stopwordsList = readFromFile("stopwords.txt", "r")
    stopwords = stopwordsList.split()
    # Maintains the document id number, the number of unique terms in the document,
    # and, for each term in the document, the term and its term frequency.
    docId = 1
    # InvFileHash
    invFileHash = {}
    # Splits the multiple documents of the same file into a list
    document = re.split(".I | \n.I", documents)[1:]
    totalDocument = len(document)
    print "Total documents:", totalDocument
    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)
        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word.lower(), 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)
        docList = addDoc(docId, listWords)
        docId += 1
        invFileHash = createInvFileHash(invFileHash, docList)
    # Writes to the Inverted File Hash file
    writeToFile(invFileHash)
    # To read the queries list from the cran query file
    queryFileRead = readFromFile("cran.qry", "r")
    # Calculate the Vector Space Model (total number of documents, stopwords list)
    vectorSpaceModel(totalDocument, queryFileRead, stopwords)
def classify(folds, nb_or_svm, ngrams, stemming, binary):
    p = PorterStemmer()
    vectorizer = CountVectorizer(input="filename",
                                 ngram_range=ngrams,
                                 tokenizer=(lambda d: [(p.stem(t, 0, len(t) - 1) if stemming else t) for t in d.split()]),
                                 binary=binary,
                                 min_df=4, max_df=1.0)
    X = vectorizer.fit_transform([f[0] for fold in folds for f in fold])
    accuracies = []
    for i in range(len(folds)):
        classifier = SVC(gamma="auto", kernel="linear") if nb_or_svm[0] == "svm" \
            else MultinomialNB(alpha=(1.0 if nb_or_svm[1] else 1.0e-10))
        start_index = 0
        for j in range(i):
            start_index += len(folds[j])
        end_index = start_index + len(folds[i])
        test_set = X[start_index:end_index]
        training_set = vstack([X[:start_index], X[end_index:]])
        classifier.fit(
            training_set,
            [f[1] for fold in (folds[:i] + folds[i + 1:]) for f in fold])
        correct_predictions = 0
        results = classifier.predict(test_set)
        for j in range(len(results)):
            correct_predictions += int(results[j] == folds[i][j][1])
        accuracies.append(100 * correct_predictions / len(results))
    if nb_or_svm[0] != "svm":
        print("smoothed" if nb_or_svm[1] else "unsmoothed", end=" ")
    print("stemmed" if stemming else "unstemmed",
          "presence" if binary else "frequency",
          "unigrams" if ngrams == (1, 1) else
          ("bigrams" if ngrams == (2, 2) else
           ("uni + bi" if ngrams == (1, 2) else "unknown")),
          "accuracy:", sum(accuracies) / len(accuracies))
def stemizeList(normalList):
    # STEMIZER FOR LIST
    porter = PorterStemmer()
    newList = []
    newDict = {}
    count = 0
    for lists in normalList:
        tokenList = []
        for token in lists:
            # print normalList.index(lists), " ", lists.index(token)
            tokenList.append(porter.stem(token, 0, len(token) - 1))
            if token in newDict:
                count = newDict[token]
                newDict[token] = count + 1
            else:
                newDict[token] = 1
        newList.append(tokenList)
        # token = porter.stem(token, 0, len(token) - 1)
    return newList, newDict
def get_postlist(stop_answer, stem_answer, dict_terms):
    if stop_answer == 'no':
        stopwords = []
    if stop_answer == 'yes':
        stopwords = [
            'i', 'a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'by',
            'for', 'from', 'how', 'in', 'is', 'it', 'of', 'on', 'or', 'that',
            'the', 'this', 'to', 'was', 'what', 'when', 'where', 'who',
            'will', 'with', 'the'
        ]
    ps = PorterStemmer()
    position_list = []
    dict_posting = {}
    counter = 0
    for key, value in dict_terms.items():
        if isinstance(value, dict):
            for k, v in value.items():
                if k == 'abstract':
                    val = v.replace(',', '').lower().split()
                    for index, word in enumerate(val):
                        if stem_answer == 'no':
                            stem_word = word
                        if stem_answer == 'yes':
                            stem_word = ps.stem(word, 0, len(word) - 1)
                        if stem_word not in stopwords:
                            if stem_word not in dict_posting:
                                dict_posting[stem_word] = {}
                            if key not in dict_posting[stem_word]:
                                dict_posting[stem_word][key] = {}
                                dict_posting[stem_word][key]['frequency'] = 0
                                dict_posting[stem_word][key]['position'] = []
                            dict_posting[stem_word][key]['frequency'] += 1
                            dict_posting[stem_word][key]['position'].append(index)
    with open('posting_list.json', 'w') as outfile:
        json.dump(dict_posting, outfile)
    print("Finished writing the posting list")
    return dict_posting
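# For illustration only: the posting list built above is a nested dict keyed by
# stemmed term, then document key. The term and document keys below are
# hypothetical sample values, not taken from the project's data.
example_posting = {
    "comput": {
        "doc1": {"frequency": 2, "position": [4, 17]},
        "doc7": {"frequency": 1, "position": [3]},
    }
}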
def GetTFIDF():
    emails = None
    x_vals = []
    y_vals = []
    stemmer = PorterStemmer()
    # Get email chains
    with open("balanced_chains.pickle", "rb") as fp1:  # Unpickling
        emails = pickle.load(fp1)
    np.random.shuffle(emails)
    i = 0
    text_data = []
    for i in range(0, len(emails)):
        print "Evaluation Email %d" % (i)
        email, next_email, time_diff, bucket = emails[i]
        # if int(np.round(time_diff / 60)) > 72:
        #     continue
        # Create stemmed body and append to text_data
        new_str = ""
        words = email['body'].split()
        for word in words:
            new_word = stemmer.stem(word, 0, len(word) - 1)
            new_str += new_word + " "
        new_str = new_str[:-1]
        text_data.append(new_str)
        # Append hour
        y_vals.append(int(np.round(time_diff / 60)))
        # y_vals.append(int(time_diff))
    b = np.array(y_vals)
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(text_data)
    tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    return X_train_tf, b, count_vect, tf_transformer, text_data
def process(text):
    '''Returns a list of words after carrying out the following text
    preprocessing and normalization steps'''
    # Convert text to lower case
    text = text.lower()
    # Remove 'Subject'
    text = re.sub(r'^sub(ject)?', ' ', text)
    # Strip HTML
    text = re.sub(r'<.*?>', ' ', text)
    # Normalize URLs
    text = re.sub(r'(http|https|ftp)://\S*', ' httpaddr ', text)
    # Normalize email addresses
    text = re.sub(r'[\w.+-]+@[\w.-]+', ' emailaddr ', text)
    # Normalize numbers
    text = re.sub(r'\b\d[\d,]*[.]*[\d]*\b', ' number ', text)
    # Normalize Dollars/Rupees
    text = re.sub(r'(\$|\brs\b|₹|£)+', ' dollar ', text)
    # Remove non-word characters
    text = re.sub(r'[^a-z]+', ' ', text)
    # Strip all whitespace characters and generate list of words
    # Stop Word Removal
    # stop_words = pickle.load(open('stopwords_set.pyset', 'rb'))
    text = [
        word for word in text.split()
        if word not in process.stop_words and len(word) > 2
    ]
    # Word Stemming
    p = PorterStemmer()
    result = []
    for word in text:
        try:
            stem_word = p.stem(word, 0, len(word) - 1)
            if stem_word not in process.stop_words:
                result.append(stem_word)
        except:
            pass
    return result
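# A hypothetical call to process(). It assumes process.stop_words has been
# attached to the function beforehand (as the commented-out pickle load above
# suggests); the stopword set and input string here are stand-ins.
process.stop_words = {"the", "a", "and", "for", "you"}
tokens = process("Subject: WIN $1000 now!!! Visit http://example.com")
print(tokens)  # e.g. ['win', 'dollar', 'number', 'now', 'visit', 'httpaddr']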
def main():
    # Reading the document from the file
    file = open("cran.all.1400", "r")
    documents = file.read()
    # Reading stop words from the file
    fileStopwords = open('stopwords.txt', 'r')
    stopwordsList = fileStopwords.read()
    stopwords = stopwordsList.split()
    # Maintains the document id number, the number of unique terms in the document,
    # and, for each term in the document, the term and its term frequency.
    documentList = []
    docId = 1
    # Splits the multiple documents of the same file into a list
    document = re.split(".I | \n.I", documents)[1:]
    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)
        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word, 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)
        sortedList = sorted(listWords.items(), key=lambda t: t[0])
        output = {'id': docId, 'unique': len(sortedList), 'terms': sortedList}
        docId += 1
        documentList.append(output)
    for i in range(0, len(documentList)):
        print "Document:", documentList[i]['id'], \
            "\nUnique Terms:", documentList[i]['unique'], \
            "\nTerms:\n", documentList[i]['terms']
def main():
    # Reading the document from the file
    file = open("cran.all.1400", "r")
    documents = file.read()
    # Reading stop words from the file
    fileStopwords = open('stopwords.txt', 'r')
    stopwordsList = fileStopwords.read()
    stopwords = stopwordsList.split()
    # Maintains the document id number, the number of unique terms in the document,
    # and, for each term in the document, the term and its term frequency.
    docId = 1
    # InvFileHash
    invFileHash = {}
    # Splits the multiple documents of the same file into a list
    document = re.split(".I | \n.I", documents)[1:]
    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)
        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word, 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)
        docList = addDoc(docId, listWords)
        docId += 1
        invFileHash = createInvFileHash(invFileHash, docList)
    print "File written: output.json"
    print "Number of terms:", len(invFileHash)
    writeToFile(invFileHash)
def vectorSpaceModel(totalDocument, queryFileRead, stopwords):
    """ Query to calculate the cosine similarity between document d and Query Q """
    # Loads the inverted File Hash
    dic = loadFromFile()
    queryList = processQueryList(queryFileRead)
    # Calculation of Inverse Document Frequency
    IDF = calculateIDF(dic, totalDocument)
    # Calculation of Term Frequency
    TF = calculateTFList(dic)
    # Calculation of Wd from all the Term Frequency calculated
    WD = calculateWD(TF, totalDocument)
    pObj = PorterStemmer()
    fileWrite = open("outputdocument.txt", "w")
    for query in queryList:
        fileWrite.write("\n---------------------------------------------------------------------------------------")
        fileWrite.write("\nQuery: " + query)
        # Separate the string of query into list of words
        listQuery = re.findall(r'\w+', query)
        # Remove the stopwords and numbers from the list of query words
        queryWithoutStopword = [x for x in listQuery if x not in stopwords and x.isalpha()]
        # Stem the list of query words
        processedQuery = [pObj.stem(x.lower(), 0, len(x) - 1) for x in queryWithoutStopword]
        # Calculate the cosine measure (Similarity) for the query
        rankedDocList = calculateSimilarity(processedQuery, IDF, WD, totalDocument)
        fileWrite.write("\nTotal number of documents retrieved: " + str(len(rankedDocList)))
        fileWrite.write("\nDocument ID:\n")
        fileWrite.write(''.join(str(rankedDocList)))
        fileWrite.write("\n---------------------------------------------------------------------------------------")
    fileWrite.close()
    print "Writing to outputdocument.txt file completes."
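# For reference, the cosine measure that calculateSimilarity is expected to compute
# can be sketched in a few lines. This is a hypothetical standalone helper over
# sparse term-weight dicts, not the project's own implementation.
import math

def cosine_similarity(query_weights, doc_weights):
    """Cosine of the angle between two sparse term-weight vectors (dicts)."""
    dot = sum(w * doc_weights.get(term, 0.0) for term, w in query_weights.items())
    norm_q = math.sqrt(sum(w * w for w in query_weights.values()))
    norm_d = math.sqrt(sum(w * w for w in doc_weights.values()))
    return dot / (norm_q * norm_d) if norm_q and norm_d else 0.0

# Identical vectors give similarity 1.0
print(cosine_similarity({'flow': 0.5, 'heat': 0.2}, {'flow': 0.5, 'heat': 0.2}))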
def buildMatrix(self):
    # use suffix-stripping algorithm to stem words
    porter_stemmer = PorterStemmer()
    for index in range(0, len(self.origin_documents)):
        document = self.origin_documents[index]
        # change document in origin_documents array to array of stemmed words
        self.origin_documents[index] = [porter_stemmer.stem(x, 0, len(x) - 1) for x in document.split()]
    # use 2000 most frequent words to generate words array
    temp_word = defaultdict(int)
    for document in self.origin_documents:
        for word in document:
            temp_word[word] += 1
    sorted_dict = sorted(temp_word.items(), key=operator.itemgetter(1))
    sorted_dict.reverse()
    self.words = [x[0] for x in sorted_dict[0:self.word_size]]
    # build document array
    for index in range(0, len(self.origin_documents)):
        document = self.origin_documents[index]
        self.documents.append([])
        self.documents[index] = [document.count(word) for word in self.words]
    # print(self.documents[0], sum(self.documents[0]))
    # remove zero sum rows
    zeros = [i for i, value in enumerate(self.documents) if sum(value) == 0]
    for value in zeros[::-1]:
        del self.labels[value]
        del self.documents[value]
    # zeros = [i for i, value in enumerate(self.documents) if sum(value) == 0]
    print(len(self.origin_documents), len(self.words), len(self.documents), self.words)
def stemWords(input):
    stem = PorterStemmer()
    for index, entries in enumerate(input):
        input[index] = stem.stem(entries, 0, len(entries) - 1)
    return input
def GetDataset():
    np.random.shuffle(emails)
    x_vals = []
    y_vals = []
    stemmer = PorterStemmer()
    word_mapping = GetWordDictionary(emails)
    i = 0
    text_data = []
    for i in range(0, len(emails)):
        # print "Evaluation Email %d" % (i)
        # note: time diff in mins
        email, next_email, time_diff, label = emails[i]
        # Create feature array
        features = []
        # Feature 1: Number of to
        features.append(len(email['to']))
        # Feature 2: Num words
        words = email['body'].split()
        lower_case_body = [stemmer.stem(x.lower(), 0, len(x) - 1) for x in words]
        features.append(len(words))
        # Feature 3: Number of CC
        features.append(email['cc_count'])
        # Feature 4: is reply
        if email['is_re']:
            features.append(1)
        else:
            features.append(0)
        # Feature 5: Time of Day (hour)
        date = email['date']['local_date']
        hour = date.hour
        features.append(hour)
        # Feature 6: Length of Subject Line
        subject_words = email['subject'].split()
        lower_case_subject = [stemmer.stem(x.lower(), 0, len(x) - 1) for x in subject_words]
        features.append(len(subject_words))
        # Feature 7: Day of Week
        features.append(date.weekday())
        # Feature 8: # Question marks in Body, bool in body
        features.append(email['body'].count('?'))
        # Feature 9: # Question marks in Subject, bool in subject
        features.append(email['subject'].count('?'))
        # NEW FEATURES
        # boolean: presence of ? in body / header
        features.append(1 if '?' in email['body'] else 0)
        features.append(1 if '?' in email['subject'] else 0)
        # Feature 12-13: "RESPONSE NEEDED" in subject or body
        keywords = ['response', 'please', 'can', 'urgent', 'important', 'need']
        for keyword in keywords:
            stemmed_keyword = stemmer.stem(keyword, 0, len(keyword) - 1)
            features.append(1 if stemmed_keyword in lower_case_subject else 0)
            features.append(1 if stemmed_keyword in lower_case_body else 0)
        x_vals.append(features)
        y_vals.append(label)
    X = np.array(x_vals)
    Y = np.array(y_vals)
    return X, Y
    iter_ = regexObject.finditer(text)
    for result in iter_:
        if result.group('id') is not None:
            # add new doc to the `list` with the id
            docs.append(Doc(int(result.group('id'))))
        else:
            # as far as we know, not both of these can be `not none` at the same time
            if result.group('title') is not None:
                # append to the last element of the docs `list`
                docs[-1].appendText(result.group('title').lower() + '\n')
            if result.group('body') is not None:
                # append to the last element of the docs `list`
                docs[-1].appendText(result.group('body').lower())
    return docs


stopWords = {'a', 'all', 'an', 'and', 'any', 'are', 'as', 'be', 'been', 'but',
             'by ', 'few', 'for', 'have', 'he', 'her', 'here', 'him', 'his',
             'how', 'i', 'in', 'is', 'it', 'its', 'many', 'me', 'my', 'none',
             'of', 'on ', 'or', 'our', 'she', 'some', 'the', 'their', 'them',
             'there', 'they', 'that ', 'this', 'us', 'was', 'what', 'when',
             'where', 'which', 'who', 'why', 'will', 'with', 'you', 'your'}

portertStemmer = PorterStemmer()

ro = re.compile(r'\d+\.+\d+|\w+')
"""regex object for finding words and digits and floats containing dot IN them
pre-cooked ready to eat"""


def regexStyleGenerator(text):
    """A generator that returns tokens of given text.
    Performs stopword removal and stemming.
    :param text: the text to tokenize
    :return: None but yields tokens
    """
    global total_num_words_before, total_num_words_after  # for statistics
DATASETS = "datasets" TRAINING = DATASETS + "/training.txt" VALIDATION = DATASETS + "/validation.txt" SPORTS = 2 POLITICS = 1 HASHTAGS = re.compile("\#([^\s\,\.\#\"\'\+\=\|\$\%\^\:]+)") URLS1 = re.compile("https?\:\/\/([^\s]+)") URLS2 = re.compile("www\.([^\s]+)") REFS = re.compile("\@([^\s\,\.\#\"\'\+\=\|\$\%\^\:\-]+)") KEYWORDS = re.compile("(\w+)") WHITESPACE = re.compile( "[\s\.\,\'\"\[\]\{\}\;\:\/\&\=\+\-\)\(\*\&\^\%\$\`\|\?\!]+") STEMMER = PorterStemmer() idtable = {} def get_id(table, key, write=True): if table.has_key(key): return table[key] else: if write: table[key] = len(table) return table[key] else: return None def register(table, keys, write=True):
def run_train_test(training_file, testing_file):
    # Set the variables, params, dicts, sets
    alpha = 0.5
    stop_words = {'the', 'and'}
    logic_negation = {'t', 'not', 'no', 'never', 'dont', 'didnt', 'doesnt'}
    Porter_Stemmer = PorterStemmer()

    # Import training dataset
    training_start_time = time.time()
    vocab = set(['positive-words', 'negative-words'])
    wordcount_class_0 = {'positive-words': 0, 'negative-words': 0}
    wordcount_class_1 = {'positive-words': 0, 'negative-words': 0}
    total_reviews = 0
    reviewscount_0 = 0
    reviewscount_1 = 0
    train_labels = []
    train_reviews = []
    with training_file as f:
        for line in f:
            review, label = line.split(',')
            words = review.split(' ')
            del words[-1]
            label = int(label.strip("\n"))
            total_reviews += 1
            # Implement negation: add NOT_ to words after logical negation
            for i in range(len(words)):
                words[i] = Porter_Stemmer.stem(words[i])
                if words[i] in logic_negation:
                    try:
                        words[i + 1] = 'NOT_' + words[i + 1]
                    except:
                        continue
                    try:
                        words[i + 2] = 'NOT_' + words[i + 2]
                    except:
                        continue
                    try:
                        words[i + 3] = 'NOT_' + words[i + 3]
                    except:
                        continue
            bigrams = []
            for i in range(len(words) - 1):
                bigram = words[i] + ' ' + words[i + 1]
                bigrams.append(bigram)
            words = set(bigrams)
            # words = set(words)
            vocab.update(words)
            for word in words:
                if word not in wordcount_class_0.keys():
                    wordcount_class_0[word] = 0
                    wordcount_class_1[word] = 0
            if label == 0:
                reviewscount_0 += 1
                for word in words:
                    wordcount_class_0[word] += 1
                    # # Analyze Sentiment lexicons
                    # unigram1, unigram2 = word.split(' ')
                    # if unigram1 in lexicons.pos_words:
                    #     wordcount_class_0['positive-words'] += 1
                    # if unigram1 in lexicons.neg_words:
                    #     wordcount_class_0['negative-words'] += 1
                    # if unigram2 in lexicons.pos_words:
                    #     wordcount_class_0['positive-words'] += 1
                    # if unigram2 in lexicons.neg_words:
                    #     wordcount_class_0['negative-words'] += 1
            if label == 1:
                reviewscount_1 += 1
                for word in words:
                    wordcount_class_1[word] += 1
                    # # Analyze Sentiment lexicons
                    # unigram1, unigram2 = word.split(' ')
                    # if unigram1 in lexicons.pos_words:
                    #     wordcount_class_0['positive-words'] += 1
                    # if unigram1 in lexicons.neg_words:
                    #     wordcount_class_0['negative-words'] += 1
                    # if unigram2 in lexicons.pos_words:
                    #     wordcount_class_0['positive-words'] += 1
                    # if unigram2 in lexicons.neg_words:
                    #     wordcount_class_0['negative-words'] += 1
            train_labels.append(label)
            train_reviews.append(words)

    # Compute CPTs
    P_class = [0, 0]
    P_class[0] = reviewscount_0 / total_reviews
    P_class[1] = reviewscount_1 / total_reviews
    P_words_class_0 = {}
    P_words_class_1 = {}
    bottom_0 = sum(wordcount_class_0.values()) + alpha * len(vocab)
    bottom_1 = sum(wordcount_class_1.values()) + alpha * len(vocab)
    for word in vocab:
        if word in stop_words:
            P_words_class_0[word] = (0 + alpha) / bottom_0
            P_words_class_1[word] = (0 + alpha) / bottom_1
        else:
            P_words_class_0[word] = (wordcount_class_0[word] + alpha) / bottom_0
            P_words_class_1[word] = (wordcount_class_1[word] + alpha) / bottom_1

    # Inference on the training dataset
    predict_train_labels = []
    for doc in train_reviews:
        log_sum_0 = 0
        log_sum_1 = 0
        bag_of_words = set(doc)
        for word in bag_of_words:
            log_sum_0 += log(P_words_class_0[word])
            log_sum_1 += log(P_words_class_1[word])
            # # Sentiment Analysis
            # unigram1, unigram2 = word.split(' ')
            # if unigram1 in lexicons.pos_words:
            #     log_sum_0 += log(P_words_class_0['positive-words'])
            #     log_sum_1 += log(P_words_class_1['positive-words'])
            # if unigram1 in lexicons.neg_words:
            #     log_sum_0 += log(P_words_class_0['negative-words'])
            #     log_sum_1 += log(P_words_class_1['negative-words'])
            # if unigram2 in lexicons.pos_words:
            #     log_sum_0 += log(P_words_class_0['positive-words'])
            #     log_sum_1 += log(P_words_class_1['positive-words'])
            # if unigram2 in lexicons.neg_words:
            #     log_sum_0 += log(P_words_class_0['negative-words'])
            #     log_sum_1 += log(P_words_class_1['negative-words'])
        Prob_c0 = log(P_class[0]) + log_sum_0
        Prob_c1 = log(P_class[1]) + log_sum_1
        if Prob_c0 > Prob_c1:
            c = 0
        else:
            c = 1
        predict_train_labels.append(c)

    # Compute training accuracy
    correct = 0
    for i in range(len(train_labels)):
        if predict_train_labels[i] == train_labels[i]:
            correct += 1
    train_accuracy = correct / len(train_labels)
    training_time = time.time() - training_start_time

    # Import testing dataset
    testing_start_time = time.time()
    test_reviews = []
    test_labels = []
    with testing_file as f:
        for line in f:
            review, label = line.split(',')
            words = review.split(' ')
            del words[-1]
            label = int(label.strip("\n"))
            # Implement negation: add NOT_ to words after logical negation
            for i in range(len(words)):
                words[i] = Porter_Stemmer.stem(words[i])
                if words[i] in logic_negation:
                    try:
                        words[i + 1] = 'NOT_' + words[i + 1]
                    except:
                        continue
                    try:
                        words[i + 2] = 'NOT_' + words[i + 2]
                    except:
                        continue
                    try:
                        words[i + 3] = 'NOT_' + words[i + 3]
                    except:
                        continue
            bigrams = []
            for i in range(len(words) - 1):
                bigram = words[i] + ' ' + words[i + 1]
                bigrams.append(bigram)
            words = set(bigrams)
            # words = set(words)
            test_labels.append(label)
            test_reviews.append(words)

    # Inference on the testing dataset
    predict_test_labels = []
    for doc in test_reviews:
        log_sum_0 = 0
        log_sum_1 = 0
        bag_of_words = set(doc)
        bag_of_words = vocab.intersection(bag_of_words)
        for word in bag_of_words:
            log_sum_0 += log(P_words_class_0[word])
            log_sum_1 += log(P_words_class_1[word])
            # # Sentiment Analysis
            # unigram1, unigram2 = word.split(' ')
            # if unigram1 in lexicons.pos_words:
            #     log_sum_0 += log(P_words_class_0['positive-words'])
            #     log_sum_1 += log(P_words_class_1['positive-words'])
            # if unigram1 in lexicons.neg_words:
            #     log_sum_0 += log(P_words_class_0['negative-words'])
            #     log_sum_1 += log(P_words_class_1['negative-words'])
            # if unigram2 in lexicons.pos_words:
            #     log_sum_0 += log(P_words_class_0['positive-words'])
            #     log_sum_1 += log(P_words_class_1['positive-words'])
            # if unigram2 in lexicons.neg_words:
            #     log_sum_0 += log(P_words_class_0['negative-words'])
            #     log_sum_1 += log(P_words_class_1['negative-words'])
        Prob_c0 = log(P_class[0]) + log_sum_0
        Prob_c1 = log(P_class[1]) + log_sum_1
        if Prob_c0 > Prob_c1:
            c = 0
        else:
            c = 1
        # print(c)
        predict_test_labels.append(c)

    # Compute testing accuracy
    correct = 0
    for i in range(len(test_labels)):
        if predict_test_labels[i] == test_labels[i]:
            correct += 1
    test_accuracy = correct / len(test_labels)

    # Print results
    testing_time = time.time() - testing_start_time
    print(round(training_time), 'seconds (training)')
    print(round(testing_time), 'seconds (labeling)')
    print(round(train_accuracy, 3), '(training)')
    print(round(test_accuracy, 3), '(testing)')
    print(len(vocab))
    return
def stemming(self, term):
    output = ""
    stem1 = PorterStemmer()
    output = stem1.stem(term)
    return output
def preprocessing(file_string):
    try:
        f = open(file_string)
        email_contents = f.read()

        # 2. Strip the header: the first blank line marks the end of the header
        cut = re.search(r'\n[ \t]*\n', email_contents).span()[1] - 1
        email_contents = email_contents[cut:]

        # 3. Other preprocessing
        # lowercase
        email_contents = email_contents.lower()
        # remove HTML tags
        email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)
        # replace URL links -> httpaddr (http:// or https:// followed by xx.xxx.xxx)
        email_contents = re.sub(r'http\:\/\/[\w\.\/]+|https\:\/\/[\w\.\/]+', 'httpaddr', email_contents)
        # replace email addresses -> emailaddr
        email_contents = re.sub(r'[\w\-\_]+\@[\w]+\.[\w]+', 'emailaddr', email_contents)
        # replace numbers (integers and decimals) -> number
        email_contents = re.sub(r'(\d+|\d+\.\d+)', 'number', email_contents)
        # replace dollar signs $ -> dollar
        email_contents = re.sub(r'\$', 'dollar', email_contents)
        # word stemming is handled below (the involved part)
        # remove non-word characters and punctuation
        email_contents = re.sub(r'[\W]', ' ', email_contents)
        # remove single letters
        for i in range(50):
            email_contents = re.sub(r' [a-z] ', ' ', email_contents)
        # collapse extra whitespace
        email_contents = re.sub(r'[\t\n ]+', ' ', email_contents)

        # from stemmer import PorterStemmer
        stemmer = PorterStemmer()
        email_contents = stemmer.stem(email_contents)

        word_list = re.findall(r'\w+', email_contents)
        # return word_list
        for word in word_list:
            if word_frequency.get(word, "None") == "None":
                word_frequency[word] = 1
            else:
                word_frequency[word] += 1
        # return word_frequency
    except:
        print (file_string + " ERROR")
        pass