def stemize(dictList):  # STEMIZER FOR DICTIONARY
    porter = PorterStemmer()
    for d in dictList:
        # iterate over a copy of the keys because the dict is mutated inside the loop
        for token in list(d.keys()):
            d[porter.stem(token, 0, len(token) - 1)] = d.pop(token)
    return dictList
def __init__(self, name, task_queue, result_queue):
    self.name = name
    self.r_stemmer = PorterStemmer()
    self.queue = task_queue
    self.queue2 = result_queue
    # load the stopword list
    pre_worker.load_stopwords()
def tokenize(sText, pairing=False):
    '''Given a string of text sText, returns a list of the individual stemmed
    tokens that occur in that string (in order). This is my quick and dirty
    Tokenizer. Satisfaction Not Guaranteed'''
    import string
    from stemmer import PorterStemmer
    sText = sText.lower()
    sText = re.sub("’", "'", sText)
    sText = re.sub("&.{0,6};", " ", sText)
    sText = re.sub("[\x80-\xff]", "", sText)
    sText = sText.split(None)
    for p in string.punctuation.replace("'", ""):
        try:
            sText = mapAndFold(lambda x: x.split(p), sText)
        except TypeError:  # empty string
            return []
    sText = mapAndFold(lambda x: x.split(), sText)
    sText = map(lambda x: x.strip("\'"), sText)
    sText = map(lambda x: x.strip("\""), sText)
    sText = map(lambda x: x.strip("_"), sText)
    sText = filter(lambda x: not re.match("\d+", x), sText)
    sText = filter(lambda x: not x == "", sText)
    sText = filter(lambda x: not x[0] == "@", sText)
    stemmer = PorterStemmer()
    if pairing:  # return original with token val in tuple
        return [(w, stemmer.stem(w, 0, len(w) - 1)) for w in sText]
    return [stemmer.stem(w, 0, len(w) - 1) for w in sText]
def ReadInfoFile(infoFile):
    global text2pddl
    fin = open(infoFile)
    lines = fin.readlines()
    fin.close()
    lines = [line.strip() for line in lines]
    unitDict = {}
    p = PorterStemmer()
    for line in lines:
        if len(line) == 0:
            continue
        parts = line.split(':')
        textName = parts[0]
        words = [p.stem(w.lower(), 0, len(w) - 1) for w in textName.split()]
        textName = ' '.join(words)
        if len(parts) > 1:
            pddlName = parts[1]
        else:
            pddlName = parts[0]
        assert(pddlName != '')
        assert(textName != '')
        for word in words:
            unitDict[word] = True
    # print unitDict.keys()
    return unitDict
def stemizeQuery(query):  # STEMIZER FOR QUERY
    porter = PorterStemmer()
    newList = []
    for q in query:
        newList.append(porter.stem(q, 0, len(q) - 1))
    return newList
def stemWords(listTokens):
    s = PorterStemmer()
    stemmedTerms = []
    for x in listTokens:
        stemmedTerms.append(s.stem(x, 0, len(x) - 1))
    return stemmedTerms
def stemWords(inList):
    outList = []
    ps = PorterStemmer()
    for token in inList:
        stemmed_token = ps.stem(token, 0, len(token) - 1)
        outList.append(stemmed_token)
    return outList
def stem_phrase(phrase):
    words = phrase.lower().replace('.', '').replace("'", '').split()
    # ignore stop words
    words = [word for word in words if word not in STOP_WORDS]
    p = PorterStemmer()
    return [p.stem(word, 0, len(word) - 1) for word in words]
def stemWords(tokens):
    """Function that stems the words."""
    # use porter stemmer
    # https://tartarus.org/martin/PorterStemmer/python.txt
    p = PorterStemmer()
    for index, word in enumerate(tokens):
        tokens[index] = p.stem(word, 0, len(word) - 1)
    return tokens
def stem(word):
    # word needs to be all lowercase before being passed to stem
    word = word.lower()
    # fancy stuff to remove .,?!"
    mymatch = re.compile('(\,|\.|\!|\?|\")')
    word = mymatch.sub(r'', word)
    p = PorterStemmer()
    word = p.stem(word, 0, len(word) - 1)
    return word
def splitToken(token, isStem=True):
    toks = token.split('_')
    word = toks[0].lower()
    tag = toks[1]
    if not word.isalnum():
        tag = 'PUNC'
    if isStem:
        # simple post stem
        p = PorterStemmer()
        #word = p.stem1(word,0,len(word)-1)
        word = p.stem(word, 0, len(word) - 1)
    return (word, tag)
def GetWordDictionary(emails):
    word_dict = {}
    count = 0
    stemmer = PorterStemmer()
    for email_case in emails:
        email = email_case[0]
        body = SplitText(email['body'])
        for word in body:
            modified_word = word
            if len(modified_word) > 1:
                modified_word = stemmer.stem(word, 0, len(word) - 1)
            if modified_word not in word_dict:
                word_dict[modified_word] = count
                count += 1
    return word_dict
def GetDataset():
    emails = None
    x_vals = []
    y_vals = []
    stemmer = PorterStemmer()
    with open("pickled_reduced_chains.txt", "rb") as fp1:  # Unpickling
        emails = pickle.load(fp1)
    i = 0
    text_data = []
    for i in range(0, len(emails)):
        print "Evaluation Email %d" % (i)
        email, next_email, time_diff = emails[i]
        print emails[i]
        # Create feature array
        features = []
        # if np.round(time_diff / 60) > 72: continue
        # Feature 1: Number of to
        features.append(len(email['to']))
        # Feature 2: Num words
        words = email['body'].split()
        features.append(len(words))
        # Feature 3: Number of CC
        features.append(email['cc_count'])
        # Feature 4: is reply
        if email['is_re']:
            features.append(1)
        else:
            features.append(0)
        # Feature 5: Time of Day (minutes)
        date = email['date']['local_date']
        hour = date.hour
        features.append(hour)
        # Feature 6: Length of Subject Line
        subject_words = email['subject'].split()
        features.append(len(subject_words))
        # Feature 7: Day of Week
        features.append(date.weekday())
        # Feature 8: Question marks in Body
        features.append(email['body'].count('?'))
        # Feature 9: Question marks in Subject
        features.append(email['subject'].count('?'))
        x_vals.append(features)
        # Append y_value for training point
        y_vals.append(int(np.round(time_diff / 60)))
    a = np.array(x_vals)
    b = np.array(y_vals)
    return a, b
def main():
    # Reading the document from the file
    fileName = "cran.all.1400"
    documents = readFromFile(fileName, "r")
    # Reading stop words from the file
    stopwordsList = readFromFile("stopwords.txt", "r")
    stopwords = stopwordsList.split()
    # For each document we track its id, the number of unique terms, and each term with its term frequency.
    docId = 1
    # Inverted file hash
    invFileHash = {}
    # Splits the multiple documents of the same file into a list
    document = re.split(".I | \n.I", documents)[1:]
    totalDocument = len(document)
    print "Total documents:", totalDocument
    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)
        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word.lower(), 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)
        docList = addDoc(docId, listWords)
        docId += 1
        invFileHash = createInvFileHash(invFileHash, docList)
    # Writes to the Inverted File Hash file
    writeToFile(invFileHash)
    # To read the queries list from the cran query file
    queryFileRead = readFromFile("cran.qry", "r")
    # Calculate the Vector Space Model (total number of documents, stopwords list)
    vectorSpaceModel(totalDocument, queryFileRead, stopwords)
def classify(folds, nb_or_svm, ngrams, stemming, binary):
    p = PorterStemmer()
    vectorizer = CountVectorizer(input="filename",
                                 ngram_range=ngrams,
                                 tokenizer=(lambda d: [(p.stem(t, 0, len(t) - 1) if stemming else t) for t in d.split()]),
                                 binary=binary,
                                 min_df=4, max_df=1.0)
    X = vectorizer.fit_transform([f[0] for fold in folds for f in fold])
    accuracies = []
    for i in range(len(folds)):
        classifier = SVC(gamma="auto", kernel="linear") if nb_or_svm[0] == "svm" \
            else MultinomialNB(alpha=(1.0 if nb_or_svm[1] else 1.0e-10))
        start_index = 0
        for j in range(i):
            start_index += len(folds[j])
        end_index = start_index + len(folds[i])
        test_set = X[start_index:end_index]
        training_set = vstack([X[:start_index], X[end_index:]])
        classifier.fit(
            training_set,
            [f[1] for fold in (folds[:i] + folds[i + 1:]) for f in fold])
        correct_predictions = 0
        results = classifier.predict(test_set)
        for j in range(len(results)):
            correct_predictions += int(results[j] == folds[i][j][1])
        accuracies.append(100 * correct_predictions / len(results))
    if nb_or_svm[0] != "svm":
        print("smoothed" if nb_or_svm[1] else "unsmoothed", end=" ")
    print("stemmed" if stemming else "unstemmed",
          "presence" if binary else "frequency",
          "unigrams" if ngrams == (1, 1) else
          ("bigrams" if ngrams == (2, 2) else
           ("uni + bi" if ngrams == (1, 2) else "unknown")),
          "accuracy:", sum(accuracies) / len(accuracies))
def GetPddlObj(_sWord):
    global text2pddl
    if text2pddl == None:
        ReadMinecraftDict(minecraftDictFile)
    setObjs = set()
    p = PorterStemmer()
    sLastWord = p.stem(_sWord.lower(), 0, len(_sWord) - 1)
    if sLastWord == 'block':
        return setObjs
    # print sLastWord
    for sTextName in text2pddl.keys():
        if text2pddl[sTextName] == 'NULL':
            continue
        lstWords = sTextName.split(' ')
        sLastTextWord = lstWords[len(lstWords) - 1]
        if sLastTextWord == 'block':
            if len(lstWords) == 1:
                continue
            sLastTextWord = lstWords[len(lstWords) - 2]
        if sLastTextWord == sLastWord:
            setObjs.add(text2pddl[sTextName])
    return list(setObjs)
def stemizeList(normalList):  # STEMIZER FOR LIST
    porter = PorterStemmer()
    newList = []
    newDict = {}
    count = 0
    for lists in normalList:
        tokenList = []
        for token in lists:
            # print normalList.index(lists), " ", lists.index(token)
            tokenList.append(porter.stem(token, 0, len(token) - 1))
            if token in newDict:
                count = newDict[token]
                newDict[token] = count + 1
            else:
                newDict[token] = 1
        newList.append(tokenList)
        # token = porter.stem(token,0,len(token)-1)
    return newList, newDict
def get_postlist(stop_answer, stem_answer, dict_terms):
    if stop_answer == 'no':
        stopwords = []
    if stop_answer == 'yes':
        stopwords = [
            'i', 'a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'by',
            'for', 'from', 'how', 'in', 'is', 'it', 'of', 'on', 'or', 'that',
            'the', 'this', 'to', 'was', 'what', 'when', 'where', 'who',
            'will', 'with'
        ]
    ps = PorterStemmer()
    position_list = []
    dict_posting = {}
    counter = 0
    for key, value in dict_terms.items():
        if isinstance(value, dict):
            for k, v in value.items():
                if k == 'abstract':
                    val = v.replace(',', '').lower().split()
                    for index, word in enumerate(val):
                        if stem_answer == 'no':
                            stem_word = word
                        if stem_answer == 'yes':
                            stem_word = ps.stem(word, 0, len(word) - 1)
                        if stem_word not in stopwords:
                            if stem_word not in dict_posting:
                                dict_posting[stem_word] = {}
                            if key not in dict_posting[stem_word]:
                                dict_posting[stem_word][key] = {}
                                dict_posting[stem_word][key]['frequency'] = 0
                                dict_posting[stem_word][key]['position'] = []
                            dict_posting[stem_word][key]['frequency'] += 1
                            dict_posting[stem_word][key]['position'].append(index)
    with open('posting_list.json', 'w') as outfile:
        json.dump(dict_posting, outfile)
    print("Finished writing the posting list")
    return dict_posting
def __init__(self, fileA, fileB):
    self.__allWords = set()
    self.__wordsA = dict()
    self.__wordsB = dict()
    with open(fileA, 'r') as document:
        for line in document:
            words = line.strip().split()
            for word in words:
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)
                if word in self.__wordsA.keys():
                    self.__wordsA[word] += 1
                else:
                    self.__wordsA[word] = 1
    with open(fileB, 'r') as document:
        for line in document:
            words = line.strip().split()
            for word in words:
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)
                if word in self.__wordsB.keys():
                    self.__wordsB[word] += 1
                else:
                    self.__wordsB[word] = 1
    self.__allWords = set(self.__wordsA.keys()) | set(self.__wordsB.keys())
    self.__table = {t[1]: t[0] for t in enumerate(self.__allWords)}
def ReadMinecraftDict(infoFile):
    global text2pddl
    fin = open(infoFile)
    lines = fin.readlines()
    fin.close()
    lines = [line.strip() for line in lines]
    text2pddl = {}
    p = PorterStemmer()
    for line in lines:
        if len(line) == 0:
            continue
        parts = line.split(':')
        textName = parts[0]
        words = [p.stem(w.lower(), 0, len(w) - 1) for w in textName.split()]
        textName = ' '.join(words)
        if len(parts) > 1:
            pddlName = parts[1]
        else:
            pddlName = parts[0]
        assert(pddlName != '')
        assert(textName != '')
        text2pddl[textName] = pddlName
def process(text):
    '''Returns a list of words after carrying out the following text
    preprocessing and normalization steps'''
    # Convert text to lower case
    text = text.lower()
    # Remove 'Subject'
    text = re.sub(r'^sub(ject)?', ' ', text)
    # Strip HTML
    text = re.sub(r'<.*?>', ' ', text)
    # Normalize URLs
    text = re.sub(r'(http|https|ftp)://\S*', ' httpaddr ', text)
    # Normalize email addresses
    text = re.sub(r'[\w.+-]+@[\w.-]+', ' emailaddr ', text)
    # Normalize numbers
    text = re.sub(r'\b\d[\d,]*[.]*[\d]*\b', ' number ', text)
    # Normalize Dollars/Rupees
    text = re.sub(r'(\$|\brs\b|₹|£)+', ' dollar ', text)
    # Remove non-word characters
    text = re.sub(r'[^a-z]+', ' ', text)
    # Strip all whitespace characters and generate list of words
    # Stop Word Removal
    # stop_words = pickle.load(open('stopwords_set.pyset', 'rb'))
    text = [
        word for word in text.split()
        if word not in process.stop_words and len(word) > 2
    ]
    # Word Stemming
    p = PorterStemmer()
    result = []
    for word in text:
        try:
            stem_word = p.stem(word, 0, len(word) - 1)
            if stem_word not in process.stop_words:
                result.append(stem_word)
        except:
            pass
    return result
def GetTFIDF():
    emails = None
    x_vals = []
    y_vals = []
    stemmer = PorterStemmer()
    # Get email chains
    with open("balanced_chains.pickle", "rb") as fp1:  # Unpickling
        emails = pickle.load(fp1)
    np.random.shuffle(emails)
    i = 0
    text_data = []
    for i in range(0, len(emails)):
        print "Evaluation Email %d" % (i)
        email, next_email, time_diff, bucket = emails[i]
        # if int(np.round(time_diff / 60)) > 72:
        #     continue
        # Create stemmed body and append to text_data
        new_str = ""
        words = email['body'].split()
        for word in words:
            new_word = stemmer.stem(word, 0, len(word) - 1)
            new_str += new_word + " "
        new_str = new_str[:-1]
        text_data.append(new_str)
        # Append hour
        y_vals.append(int(np.round(time_diff / 60)))
        # y_vals.append(int(time_diff))
    b = np.array(y_vals)
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(text_data)
    tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    return X_train_tf, b, count_vect, tf_transformer, text_data
def buildMatrix(self):
    # use the suffix-stripping algorithm to stem words
    porter_stemmer = PorterStemmer()
    for index in range(0, len(self.origin_documents)):
        document = self.origin_documents[index]
        # change each document in the origin_documents array to an array of stemmed words
        self.origin_documents[index] = [porter_stemmer.stem(x, 0, len(x) - 1) for x in document.split()]
    # use the 2000 most frequent words to generate the words array
    temp_word = defaultdict(int)
    for document in self.origin_documents:
        for word in document:
            temp_word[word] += 1
    sorted_dict = sorted(temp_word.items(), key=operator.itemgetter(1))
    sorted_dict.reverse()
    self.words = [x[0] for x in sorted_dict[0:self.word_size]]
    # build document array
    for index in range(0, len(self.origin_documents)):
        document = self.origin_documents[index]
        self.documents.append([])
        self.documents[index] = [document.count(word) for word in self.words]
    # print(self.documents[0], sum(self.documents[0]))
    # remove zero-sum rows
    zeros = [i for i, value in enumerate(self.documents) if sum(value) == 0]
    for value in zeros[::-1]:
        del self.labels[value]
        del self.documents[value]
    # zeros = [i for i, value in enumerate(self.documents) if sum(value) == 0]
    print(len(self.origin_documents), len(self.words), len(self.documents), self.words)
def vectorSpaceModel(totalDocument, queryFileRead, stopwords):
    """ Query to calculate the cosine similarity between document d and Query Q """
    # Loads the inverted File Hash
    dic = loadFromFile()
    queryList = processQueryList(queryFileRead)
    # Calculation of Inverse Document Frequency
    IDF = calculateIDF(dic, totalDocument)
    # Calculation of Term Frequency
    TF = calculateTFList(dic)
    # Calculation of Wd from all the Term Frequency calculated
    WD = calculateWD(TF, totalDocument)
    pObj = PorterStemmer()
    fileWrite = open("outputdocument.txt", "w")
    for query in queryList:
        fileWrite.write("\n---------------------------------------------------------------------------------------")
        fileWrite.write("\nQuery: " + query)
        # Separate the string of query into list of words
        listQuery = re.findall(r'\w+', query)
        # Remove the stopwords and numbers from the list of query words
        queryWithoutStopword = [x for x in listQuery if x not in stopwords and x.isalpha()]
        # Stem the list of query words
        processedQuery = [pObj.stem(x.lower(), 0, len(x) - 1) for x in queryWithoutStopword]
        # Calculate the cosine measure (Similarity) for the query
        rankedDocList = calculateSimilarity(processedQuery, IDF, WD, totalDocument)
        fileWrite.write("\nTotal number of documents retrieved: " + str(len(rankedDocList)))
        fileWrite.write("\nDocument ID:\n")
        fileWrite.write(''.join(str(rankedDocList)))
        fileWrite.write("\n---------------------------------------------------------------------------------------")
    fileWrite.close()
    print "Writing to outputdocument.txt file completes."
def main():
    # Reading the document from the file
    file = open("cran.all.1400", "r")
    documents = file.read()
    # Reading stop words from the file
    fileStopwords = open('stopwords.txt', 'r')
    stopwordsList = fileStopwords.read()
    stopwords = stopwordsList.split()
    # List that maintains, for each document, its id, the number of unique terms, and each term with its term frequency.
    documentList = []
    docId = 1
    # Splits the multiple documents of the same file into a list
    document = re.split(".I | \n.I", documents)[1:]
    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)
        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word, 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)
        sortedList = sorted(listWords.items(), key=lambda t: t[0])
        output = {'id': docId, 'unique': len(sortedList), 'terms': sortedList}
        docId += 1
        documentList.append(output)
    for i in range(0, len(documentList)):
        print "Document:", documentList[i]['id'], "\nUnique Terms:", documentList[i]['unique'], "\nTerms:\n", documentList[i]['terms']
def main():
    array = []
    array2 = []
    p = PorterStemmer()
    with open(sys.argv[1]) as f:
        for line in f:
            if len(line) > 1:
                array.append(line[0:len(line) - 1])
    word_dictionary, tag_dictionary, count = read_input(array, p)
    with open(sys.argv[2]) as f:
        for line in f:
            if len(line) > 1:
                array2.append(line[0:len(line) - 1])
    read_test_data(array2, word_dictionary, tag_dictionary, count)
def main():
    # Reading the document from the file
    file = open("cran.all.1400", "r")
    documents = file.read()
    # Reading stop words from the file
    fileStopwords = open('stopwords.txt', 'r')
    stopwordsList = fileStopwords.read()
    stopwords = stopwordsList.split()
    # For each document we track its id, the number of unique terms, and each term with its term frequency.
    docId = 1
    # Inverted file hash
    invFileHash = {}
    # Splits the multiple documents of the same file into a list
    document = re.split(".I | \n.I", documents)[1:]
    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)
        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word, 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)
        docList = addDoc(docId, listWords)
        docId += 1
        invFileHash = createInvFileHash(invFileHash, docList)
    print "File written: output.json"
    print "Number of terms:", len(invFileHash)
    writeToFile(invFileHash)
class Cleaner(object):
    def __init__(self, stopwords):
        self.stopwords = stopwords
        self.stemmer = PorterStemmer()

    def clean_word(self, word):
        word = word.strip().lower()
        word = filter(lambda c: c.isalnum(), word)
        if word in self.stopwords:
            word = None
        else:
            word = self.stemmer.stem(word, 0, len(word) - 1)
        return word

    def clean_wordlist(self, wordlist):
        wordlist = " ".join(wordlist).replace('-', ' ').split()
        clean_list = map(lambda x: self.clean_word(x), wordlist)
        return [word for word in clean_list if word]

    @staticmethod
    def make_printable(phrase):
        return filter(lambda c: c in string.printable, phrase)
class Indexer(): # remove stop words and do stemming STOP_WORD_LIST = ["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","just","keep","keeps","kept","know","knows","known","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","que","quite","qv","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t's","take","taken","tell","tends","th","than","thank","thanks","t
hanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","value","various","very","via","viz","vs","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","would","wouldn't","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","zero"] def __init__(self): logging.debug('Indexer => Init params:self') self.idx_fields = [] # field of document to be indexed #self.STOP_WORD_LIST = [] self.P = PorterStemmer() # end of function ''' def set_stop_words(self,stop_word_list): self.STOP_WORD_LIST = stop_word_list # end of function ''' def set_idx_fields(self,fields): logging.debug('Indexer => set_idx_fields fields:' + str(fields)) self.idx_fields = fields def add_idx_field(self,field_name): self.idx_fields.append(field_name) def clean(self,word): #preprocess word word = word.lower() word = word.strip("\n\t,.(){}?!;'") if word not in self.STOP_WORD_LIST: word = self.P.stem(word,0,len(word)-1) else: word = "" return word # end of function def tokenize(self, text): #list word_idx = [] # split lines lines = text.split('\n') for line in lines: # split words words = line.split(' ') for word in words: word = self.clean(word) if len(word) > 1: word_idx.append(word) # make a set (remove duplicate) word_idx = set(word_idx) return word_idx # end of function def index(self, document): if isinstance(document,list): document = document[0] text = "" # get text from document to be indexed for field in self.idx_fields: text += document[field] + " " return self.tokenize(text) def stem(self, words): return [self.tokenize(word) for word in words]
def stemming(self, term):
    output = ""
    stem1 = PorterStemmer()
    output = stem1.stem(term)
    return output
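# Note: the stem(term) call above takes a single argument, while most snippets in
# this collection use the three-argument stem(word, 0, len(word) - 1) convention of
# Martin Porter's reference Python implementation (the tartarus.org link cited earlier).
# A minimal adapter sketch that tolerates both calling conventions; it assumes the
# reference implementation is saved locally as stemmer.py, and the helper name
# stem_token is illustrative rather than taken from any of these sources.

# Hypothetical adapter (not from the snippets above): try the one-argument call used
# by NLTK-style stemmers first, then fall back to the three-argument (word, i, j)
# signature of the tartarus.org reference implementation.
from stemmer import PorterStemmer  # assumption: reference code saved as stemmer.py

_stemmer = PorterStemmer()

def stem_token(word):
    word = word.lower()
    try:
        return _stemmer.stem(word)                    # NLTK-style: stem(word)
    except TypeError:
        return _stemmer.stem(word, 0, len(word) - 1)  # reference-style: stem(word, i, j)

if __name__ == "__main__":
    print(stem_token("running"))  # expected output: "run"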
def stemWord(self, word):
    return PorterStemmer.stem(self, word, 0, len(word) - 1)
def __init__(self, language):
    PorterStemmer.__init__(self)
class W2V: def __init__(self, path): self.stemmer = PorterStemmer() self.path = path self.cur_idx = 0 self.batch = 2 self.sample = 0.001 self.vocab_size = 10000 self.total_count = 0 self.word_count = Counter() self.word2idx = defaultdict(int) self.idx2word = {} self.word_sample = {} self.batch_size = 128 self.embedding_size = 128 # Dimension of the embedding vector. self.skip_window = 3 # How many words to consider left and right. self.raw_sample_probs = [0.5, 0.3, 0.2] self.sample_probs = [] sum = 0 for prob in self.raw_sample_probs: sum += prob self.sample_probs.append(sum) self.num_skips = 2 # How many times to reuse an input to generate a label. self.valid_size = 16 # Random set of words to evaluate similarity on. self.valid_window = 100 # Only pick dev samples in the head of the distribution. #self.valid_examples = np.random.choice(valid_window, valid_size, replace=False) self.num_negative_sampled = 64 # Number of negative examples to sample. self.batch_index = 0 self.get_stat() def get_stat(self): import json import collections import os.path filename = "stat" if os.path.exists(filename): f = open(filename, 'rb') obj = pickle.load(f) self.word2idx = obj["word2idx"] self.idx2word = obj["idx2word"] self.word_count = obj["word_count"] self.word_sample = obj["word_sample"] self.total_count = obj["total_count"] return line_idx = 0 text_data = "" with open(self.path, "r") as ins: for line in ins: obj = json.loads(line) reviewText = obj["reviewText"] summary = obj["summary"] reviewerID = obj["reviewerID"] overall = obj["overall"] asin = obj["asin"] text_data = " ".join([text_data ,reviewText, summary]) line_idx += 1 if line_idx % 1000 == 0: self.word_count + collections.Counter(self.stemmer.get_stem_wordlist(text_data.split())) text_data = "" self.word_count = self.word_count.most_common(self.vocab_size-1) for word,cnt in self.word_count: self.word2idx[word] = 1+len(self.word2idx) #self.idx2word = dict(zip(self.word2idx.values(), self.word2idx.keys())) #calculate the sample #threshold_count = self.sample * self.total_count #for word in self.word_count: # word_probability = (sqrt(self.word_count[word] / threshold_count) + 1) * (threshold_count / self.word_count[word]) # self.word_sample[word] = int(round(word_probability * 2**32)) f = open(filename, 'wb') pickle.dump({"word2idx":self.word2idx,"idx2word":self.idx2word,"word_count":self.word_count,"word_sample":self.word_sample,"total_count":self.total_count},f) def get_batch(self): #global batch_index #calculate sample prob inside window #sample_probs = [] #sum = (1<<(self.skip_window)) - 1 #prob = 0 #for idx in range(self.skip_window,0,-1): # prob += (1<<(idx-1)) / sum # sample_probs.append(prob) #prepare buffer span = 2 * self.skip_window + 1 # [ skip_window target skip_window ] buffer = collections.deque(maxlen=span) context_data = [] #list append is better than numpy append, so using list append first and then convert into numpy obj target_data = [] # same as above for i in range(1, self.batch_size): idx = self.batch_index + i #line = linecache.getline(self.path, idx) line = '{ "reviewerID": "A2SUAM1J3GNN3B", "asin": "0000013714", "reviewerName": "J. McDonald", "helpful": [2, 3], "reviewText": "I bought this for my husband who plays the piano. He is having a wonderful time playing these old hymns. The music is at times hard to read because we think the book was published for singing from more than playing from. 
Great purchase though!", "overall": 5.0, "summary": "Heavenly Highway Hymns", "unixReviewTime": 1252800000, "reviewTime": "09 13, 2009" }' if line is None or len(line) == 0: print("current idx,", idx, " current batch_idx, ", self.batch_index, " line: ", line) continue obj = json.loads(line) reviewText = obj["reviewText"] summary = obj["summary"] reviewerID = obj["reviewerID"] overall = obj["overall"] asin = obj["asin"] text_data = " ".join([reviewText, summary]).split() for word_idx in range(0, len(text_data)): while len(buffer) < span: buffer.append(self.word2idx[self.stemmer.get_stem_word(text_data[word_idx])]) target_word = self.word2idx[buffer[self.skip_window]] context_word = target_word avoid_context_word = [target_word] r = rd.random() for cnt_idx in range(0, int(self.skip_window/2)): #random pick up skip_window/2 context word while context_word in avoid_context_word: #for avoid repeat sample for rank_idx in range(0,self.skip_window): #from closest to farest if r <= self.sample_probs[rank_idx]: if rd.random() >= 0.5: context_word = self.word2idx[buffer[self.skip_window - (rank_idx + 1)]] else: context_word = self.word2idx[buffer[self.skip_window + (rank_idx + 1)]] break if context_word not in avoid_context_word: avoid_context_word.append(context_word) context_data.append(context_word) target_data.append(target_word) #for next word buffer.append(self.stemmer.get_stem_word(text_data[word_idx])) #update global batch_index self.batch_index += self.batch_size context_data = np.ndarray(shape=(len(context_data)), dtype=np.int32) target_data = np.ndarray(shape=(len(target_data), 1), dtype=np.int32) return context_data, target_data def train(self): embedding_size = 128 # Dimension of the embedding vector. graph = tf.Graph() with graph.as_default(): # Input data. train_inputs = tf.placeholder(tf.int32) train_labels = tf.placeholder(tf.int32) # Ops and variables pinned to the CPU because of missing GPU implementation with tf.device('/cpu:0'): # Look up embeddings for inputs. embeddings = tf.Variable( tf.random_uniform([self.vocab_size, embedding_size], -1.0, 1.0), name = "emb") embed = tf.nn.embedding_lookup(embeddings, train_inputs) # Construct the variables for the NCE loss nce_weights = tf.Variable( tf.truncated_normal([self.vocab_size, embedding_size], stddev=1.0 / sqrt(embedding_size))) nce_biases = tf.Variable(tf.zeros([self.vocab_size])) # Compute the average NCE loss for the batch. # tf.nce_loss automatically draws a new sample of the negative labels each # time we evaluate the loss. loss = tf.reduce_mean( tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels, self.num_negative_sampled, self.vocab_size)) # Construct the SGD optimizer using a learning rate of 1.0. optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss) saver = tf.train.Saver() init = tf.initialize_all_variables() self.num_steps = 1000000 with tf.Session(graph=graph) as session: # We must initialize all variables before we use them. init.run() print("Initialized") average_loss = 0 for step in range(self.num_steps): batch_inputs, batch_labels = self.get_batch() feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels} # We perform one update step by evaluating the optimizer op (including it # in the list of returned values for session.run() _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict) average_loss += loss_val if step % 2000 == 0: if step > 0: average_loss /= 50 # The average loss is an estimate of the loss over the last 2000 batches. 
print("Average loss at step ", step, ": ", average_loss) filename = "_".join(["embedding",str(step)]) saver.save(session, filename) average_loss = 0 return
    iter_ = regexObject.finditer(text)
    for result in iter_:
        if result.group('id') is not None:
            docs.append(Doc(int(result.group('id'))))  # add new doc to the `list` with the id
        else:
            # as far as we know, not both of these can be `not none` at the same time
            if result.group('title') is not None:
                docs[-1].appendText(result.group('title').lower() + '\n')  # append to the last element of the docs `list`
            if result.group('body') is not None:
                docs[-1].appendText(result.group('body').lower())  # append to the last element of the docs `list`
    return docs


stopWords = {'a', 'all', 'an', 'and', 'any', 'are', 'as', 'be', 'been', 'but', 'by ', 'few', 'for',
             'have', 'he', 'her', 'here', 'him', 'his', 'how', 'i', 'in', 'is', 'it', 'its', 'many',
             'me', 'my', 'none', 'of', 'on ', 'or', 'our', 'she', 'some', 'the', 'their', 'them',
             'there', 'they', 'that ', 'this', 'us', 'was', 'what', 'when', 'where', 'which', 'who',
             'why', 'will', 'with', 'you', 'your'}

portertStemmer = PorterStemmer()
ro = re.compile(r'\d+\.+\d+|\w+')
"""regex object for finding words, digits, and floats containing a dot in them; pre-cooked, ready to eat"""


def regexStyleGenerator(text):
    """
    A generator that returns tokens of the given text;
    performs stopword removal and stemming.
    :param text: the text to tokenize
    :return: None, but yields tokens
    """
    global total_num_words_before, total_num_words_after  # for statistics
def run_train_test(training_file, testing_file): # Set the variables, params, dicts, sets alpha = 0.5 stop_words = {'the', 'and'} logic_negation = {'t', 'not', 'no', 'never', 'dont', 'didnt', 'doesnt'} Porter_Stemmer = PorterStemmer() # Import training dataset training_start_time = time.time() vocab = set(['positive-words', 'negative-words']) wordcount_class_0 = {'positive-words': 0, 'negative-words': 0} wordcount_class_1 = {'positive-words': 0, 'negative-words': 0} total_reviews = 0 reviewscount_0 = 0 reviewscount_1 = 0 train_labels = [] train_reviews = [] with training_file as f: for line in f: review, label = line.split(',') words = review.split(' ') del words[-1] label = int(label.strip("\n")) total_reviews += 1 # Implement negation: add NOT_ to words after logical negation for i in range(len(words)): words[i] = Porter_Stemmer.stem(words[i]) if words[i] in logic_negation: try: words[i + 1] = 'NOT_' + words[i + 1] except: continue try: words[i + 2] = 'NOT_' + words[i + 2] except: continue try: words[i + 3] = 'NOT_' + words[i + 3] except: continue bigrams = [] for i in range(len(words) - 1): bigram = words[i] + ' ' + words[i + 1] bigrams.append(bigram) words = set(bigrams) # words = set(words) vocab.update(words) for word in words: if word not in wordcount_class_0.keys(): wordcount_class_0[word] = 0 wordcount_class_1[word] = 0 if label == 0: reviewscount_0 += 1 for word in words: wordcount_class_0[word] += 1 # # Analyze Sentiment lexicons # unigram1, unigram2 = word.split(' ') # if unigram1 in lexicons.pos_words: # wordcount_class_0['positive-words'] += 1 # if unigram1 in lexicons.neg_words: # wordcount_class_0['negative-words'] += 1 # if unigram2 in lexicons.pos_words: # wordcount_class_0['positive-words'] += 1 # if unigram2 in lexicons.neg_words: # wordcount_class_0['negative-words'] += 1 if label == 1: reviewscount_1 += 1 for word in words: wordcount_class_1[word] += 1 # # Analyze Sentiment lexicons # unigram1, unigram2 = word.split(' ') # if unigram1 in lexicons.pos_words: # wordcount_class_0['positive-words'] += 1 # if unigram1 in lexicons.neg_words: # wordcount_class_0['negative-words'] += 1 # if unigram2 in lexicons.pos_words: # wordcount_class_0['positive-words'] += 1 # if unigram2 in lexicons.neg_words: # wordcount_class_0['negative-words'] += 1 train_labels.append(label) train_reviews.append(words) # Compute CPTs P_class = [0, 0] P_class[0] = reviewscount_0 / total_reviews P_class[1] = reviewscount_1 / total_reviews P_words_class_0 = {} P_words_class_1 = {} bottom_0 = sum(wordcount_class_0.values()) + alpha * len(vocab) bottom_1 = sum(wordcount_class_1.values()) + alpha * len(vocab) for word in vocab: if word in stop_words: P_words_class_0[word] = (0 + alpha) / bottom_0 P_words_class_1[word] = (0 + alpha) / bottom_1 else: P_words_class_0[word] = (wordcount_class_0[word] + alpha) / bottom_0 P_words_class_1[word] = (wordcount_class_1[word] + alpha) / bottom_1 # Inference on the training dataset predict_train_labels = [] for doc in train_reviews: log_sum_0 = 0 log_sum_1 = 0 bag_of_words = set(doc) for word in bag_of_words: log_sum_0 += log(P_words_class_0[word]) log_sum_1 += log(P_words_class_1[word]) # # Sentiment Analysis # unigram1, unigram2 = word.split(' ') # if unigram1 in lexicons.pos_words: # log_sum_0 += log(P_words_class_0['positive-words']) # log_sum_1 += log(P_words_class_1['positive-words']) # if unigram1 in lexicons.neg_words: # log_sum_0 += log(P_words_class_0['negative-words']) # log_sum_1 += log(P_words_class_1['negative-words']) # if unigram2 in 
lexicons.pos_words: # log_sum_0 += log(P_words_class_0['positive-words']) # log_sum_1 += log(P_words_class_1['positive-words']) # if unigram2 in lexicons.neg_words: # log_sum_0 += log(P_words_class_0['negative-words']) # log_sum_1 += log(P_words_class_1['negative-words']) Prob_c0 = log(P_class[0]) + log_sum_0 Prob_c1 = log(P_class[1]) + log_sum_1 if Prob_c0 > Prob_c1: c = 0 else: c = 1 predict_train_labels.append(c) # Compute training accuracy correct = 0 for i in range(len(train_labels)): if predict_train_labels[i] == train_labels[i]: correct += 1 train_accuracy = correct / len(train_labels) training_time = time.time() - training_start_time # Import testing dataset testing_start_time = time.time() test_reviews = [] test_labels = [] with testing_file as f: for line in f: review, label = line.split(',') words = review.split(' ') del words[-1] label = int(label.strip("\n")) # Implement negation: add NOT_ to words after logical negation for i in range(len(words)): words[i] = Porter_Stemmer.stem(words[i]) if words[i] in logic_negation: try: words[i + 1] = 'NOT_' + words[i + 1] except: continue try: words[i + 2] = 'NOT_' + words[i + 2] except: continue try: words[i + 3] = 'NOT_' + words[i + 3] except: continue bigrams = [] for i in range(len(words) - 1): bigram = words[i] + ' ' + words[i + 1] bigrams.append(bigram) words = set(bigrams) # words = set(words) test_labels.append(label) test_reviews.append(words) # Inference on the testing dataset predict_test_labels = [] for doc in test_reviews: log_sum_0 = 0 log_sum_1 = 0 bag_of_words = set(doc) bag_of_words = vocab.intersection(bag_of_words) for word in bag_of_words: log_sum_0 += log(P_words_class_0[word]) log_sum_1 += log(P_words_class_1[word]) # # Sentiment Analysis # unigram1, unigram2 = word.split(' ') # if unigram1 in lexicons.pos_words: # log_sum_0 += log(P_words_class_0['positive-words']) # log_sum_1 += log(P_words_class_1['positive-words']) # if unigram1 in lexicons.neg_words: # log_sum_0 += log(P_words_class_0['negative-words']) # log_sum_1 += log(P_words_class_1['negative-words']) # if unigram2 in lexicons.pos_words: # log_sum_0 += log(P_words_class_0['positive-words']) # log_sum_1 += log(P_words_class_1['positive-words']) # if unigram2 in lexicons.neg_words: # log_sum_0 += log(P_words_class_0['negative-words']) # log_sum_1 += log(P_words_class_1['negative-words']) Prob_c0 = log(P_class[0]) + log_sum_0 Prob_c1 = log(P_class[1]) + log_sum_1 if Prob_c0 > Prob_c1: c = 0 else: c = 1 # print(c) predict_test_labels.append(c) # Compute testing accuracy correct = 0 for i in range(len(test_labels)): if predict_test_labels[i] == test_labels[i]: correct += 1 test_accuracy = correct / len(test_labels) # Print results testing_time = time.time() - testing_start_time print(round(training_time), 'seconds (training)') print(round(testing_time), 'seconds (labeling)') print(round(train_accuracy, 3), '(training)') print(round(test_accuracy, 3), '(testing)') print(len(vocab)) return
        'browser': 1
    },
    'file': {
        'ls-time': 1
    },
    'ls': {
        'ls-time': 1,
        'tag-time': 1
    },
    'refspec': {
        'new-branch-push': 2
    }
}

computed = {}
stemmer = PorterStemmer()
for word in sys.argv[1:]:
    word = stemmer.stem(word.lower())
    print word
    if word in synonymMap:
        word = synonymMap[word]
    if word in weightMap:
        for key, value in weightMap[word].iteritems():
            if key in computed:
                computed[key] += value
            else:
                computed[key] = value
sorted_computed = sorted(computed.iteritems(), key=operator.itemgetter(1))
from heapq import heappush, heappop
from collections import defaultdict
from stemmer import PorterStemmer
from pairwise import pairwise

here = lambda *x: os.path.abspath(os.path.join(os.path.dirname(__file__), *x))
stopwords_file = here('stopwords', )
index_file = here('index', )
lengths_file = here('lengths', )
top = 20
lambda_ = 0.5

query = sys.argv[1:]
inner_query = []
p = PorterStemmer()
index = {}
count_term = {}
start_time = time.time()

with open(stopwords_file, "r") as file:
    stopwords = map(lambda line: line.strip(), file.readlines())

with open(index_file, "r") as index_file:
    lines = index_file.readlines()
    for line in lines:
        entry = line.split(" ")
        documents = entry[2:]
        dictionary = defaultdict(int)
        for document, count in pairwise(documents):
def stem(word):
    p = PorterStemmer()
    return p.stem(word, 0, len(word) - 1)
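# Several snippets above construct a fresh PorterStemmer inside a loop or per call,
# which works but is wasteful. A small usage sketch, under the same assumption that
# the reference stemmer.py module is importable; stem_all is an illustrative name,
# not from the original code.
from stemmer import PorterStemmer  # assumption: local copy of the reference implementation

_SHARED_STEMMER = PorterStemmer()

def stem_all(words):
    # (word, 0, len(word) - 1) is the reference implementation's
    # "stem the slice word[0:len(word)]" calling convention
    return [_SHARED_STEMMER.stem(w.lower(), 0, len(w) - 1) for w in words]

# Example: stem_all(["Stemming", "queries", "quickly"]) -> ["stem", "queri", "quickli"]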
DATASETS = "datasets"
TRAINING = DATASETS + "/training.txt"
VALIDATION = DATASETS + "/validation.txt"
SPORTS = 2
POLITICS = 1
HASHTAGS = re.compile("\#([^\s\,\.\#\"\'\+\=\|\$\%\^\:]+)")
URLS1 = re.compile("https?\:\/\/([^\s]+)")
URLS2 = re.compile("www\.([^\s]+)")
REFS = re.compile("\@([^\s\,\.\#\"\'\+\=\|\$\%\^\:\-]+)")
KEYWORDS = re.compile("(\w+)")
WHITESPACE = re.compile(
    "[\s\.\,\'\"\[\]\{\}\;\:\/\&\=\+\-\)\(\*\&\^\%\$\`\|\?\!]+")
STEMMER = PorterStemmer()
idtable = {}


def get_id(table, key, write=True):
    if table.has_key(key):
        return table[key]
    else:
        if write:
            table[key] = len(table)
            return table[key]
        else:
            return None


def register(table, keys, write=True):
class Crawler: def __init__(self): """ Author: Nicole This method declares the list stopwords, dictionaries all_words and all_words_freq, as well as the PorterStemmer object. """ self.stopwords = [] self.p = PorterStemmer() self.all_words = {} self.all_words_freq = {} self.tfidf = {} self.vocabulary = [] self.doc_term_matrix = [[0] * 23 for n in range(809)] self.docs = {} self.visited = [] def clean_url(self, url) : """ Author: Jason This method removes the base url EX. http://lyle.smu.edu/~fmoore/schedule.htm => schedule.htm """ url = re.compile(_ROOT_).sub('', url) url = re.compile('http://lyle.smu.edu/~fmoore').sub('', url) url = re.compile('index.*').sub('', url) url = re.compile('.*.gif').sub('', url) return re.compile('\.\./').sub('', url) def fetch(self, url) : """ Author: Jason This method will fetch the contents of the page. """ r = requests.get(urlparse.urljoin(_ROOT_, self.clean_url(url))) return r.text def extract_urls(self, text) : """ Author: Jason This method will take the contents of a page and extract all of the URLs on it """ urls = [] soup = BeautifulSoup(text, 'html.parser') for atag in soup.find_all('a'): urls.append(atag.get('href')) for img in soup.find_all('img'): urls.append(img.get('src')) return urls def external_link(self, url) : """ Author: Jason This method will check if the URL is an external link outside the root domain """ if url : url = re.compile('https*://').sub('', url) if re.compile('.*lyle.smu.edu/~fmoore.*').match(url) : return False elif re.compile('www.*').match(url) : return True elif re.compile('java-source.*').match(url) : return True elif re.compile('.*smu.edu.*').match(url) : return True elif re.compile('.*.aspx').match(url) : return True elif re.compile('mailto:').match(url) : return True elif re.compile('.*.xlsx').match(url) : return False elif requests.get(_ROOT_ + url).status_code == 200 : return False elif self.jpeg_link(url) : return False else : return True else : return True def jpeg_link(self, url) : """ Author: Jason This method will check if the link is a JPEG """ return True if re.compile('.*.jpg').match(url) else False def broken_link(self, url) : """ Author: Jason This method will check if the link is broken. """ return False if requests.get(_ROOT_ + self.clean_url(url)).status_code == 200 else True def excel_link(self,url) : """ Author: Jason This method will check if the link is an excel file. """ return True if re.compile('.*.xlsx').match(url) else False def add_root_to_links(self, urls) : """ Author: Jason This method will add the root URL to all of the links for visual apperance """ new_urls = [_ROOT_ + re.compile('http://lyle.smu.edu/~fmoore/').sub('', link) for link in urls] return new_urls def remove_extra_whitespace(self, text) : """ Author: Nicole This method removes more than one white space between words. """ p = re.compile(r'\s+') return p.sub(' ', text) def remove_punctuation(self, text) : """ Author: Nicole This method uses regex to remove the punctuation in text. http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python """ exclude = set(string.punctuation) return ''.join(ch for ch in text if ch not in exclude) def load_stop_words(self, file) : """ Author: Nicole This method stores the list of stopwords from a file to the class variable list self.stopwords. 
""" self.stopwords = [line.rstrip(' \n').lower() for line in open(file)] def prepare_text(self, text) : """ Author: Nicole This method prepares the raw HTML text for it to be indexed by lowering the letters, removing the HTML tags, removing the punctuation, removing the extra white space, changing the list to ASCII from unicode, removing the stop words, and stemming each word. """ text = strip_tags(text.lower()) text = self.remove_punctuation(text) text = self.remove_extra_whitespace(text) text = [word.encode('UTF8') for word in text.split()] text = [word for word in text if word not in self.stopwords] text = self.p.stem_word(text) return text def add_root_if_not_there(self, url) : """ Author: Jason This method will add the root url to a single link if it isnt there """ url = re.compile('http://lyle.smu.edu/~fmoore/').sub('', url) return _ROOT_ + url def index(self, url, doc_words) : """ Author: Nicole This method indexes all the words in a document and keeps track of the frequency of a word in overall documents and overall occurrences. """ for key in doc_words : if key not in self.all_words: self.all_words[key] = [(url, doc_words[key])] self.all_words_freq[key] = [1, doc_words[key]] self.vocabulary.append(key) # [word][docID] = word_freq # print '['+str(self.vocabulary.index(key))+']['+str(self.docs[self.add_root_if_not_there(url)])+'] = ' + str(self.all_words[key][0][1]) self.doc_term_matrix[self.vocabulary.index(key)][self.docs[self.add_root_if_not_there(url)]] = self.all_words[key][0][1] else: self.all_words[key].append((url, doc_words[key])) self.all_words_freq[key][0] += 1 self.all_words_freq[key][1] += doc_words[key] for tup in self.all_words[key] : if tup[0] == str(url) : self.doc_term_matrix[self.vocabulary.index(key)][self.docs[self.add_root_if_not_there(url)]] = tup[1] def calTFIDF(self, word, visited) : """ Author: Nicole This method will calculate the TF-IDF for a given word. 
1 + log(number of times word appears in a document) * log(total documents/ how many documents the word appears in) """ if word in self.all_words: for i in self.all_words[word] : return (1 + log10(i[1])) * log10(len(visited)/self.all_words_freq[word][0]) else : return 0 def write_output(self, visited, external, jpeg, broken, dictionary) : """ Author: Jason and Nicole but mostly Jason except for lines 211 - 213 This method will write the output of the crawler and the 20 most common words to output.txt """ dictionary = sorted(dictionary.items(), key=lambda e: e[1][1], reverse=True)[:20] f = open('output.txt', 'w') f.write('Output of Jason and Nicole\'s web crawler.\n') f.write('Current Time: ') f.write(strftime("%Y-%m-%d %H:%M:%S", localtime())) f.write('\n\n') # Visited links f.write('Visited Links: (' + str(len(visited)) + ' total)\n') for link in visited : f.write(link + '\n') f.write('\n') # External links f.write('External Links: (' + str(len(external)) + ' total)\n') for link in external : f.write(link + '\n') f.write('\n') # JPEG links f.write('JPEG Links: (' + str(len(jpeg)) + ' total)\n') for link in jpeg : f.write(link + '\n') f.write('\n') # Broken links f.write('Broken Links: (' + str(len(broken)) + ' total)\n') for link in broken : f.write(link + '\n') f.write('\n') # Term Frequency f.write('Top 20 Most Common Words with Document Frequency\n') for i in dictionary: f.write('The term ' + i[0] + ' occurs ' + str(i[1][1]) + ' times in ' + str(i[1][0]) + ' documents.\n') f.close() f = open('term_document_frequency_matrix.txt', 'w') f.write('Term/Document Frequency Matrix for Jason and Nicole\'s web crawler.\n') f.write('Current Time: ') f.write(strftime("%Y-%m-%d %H:%M:%S", localtime())) f.write('\n\n ') # 15 spaces for key, val in self.docs.iteritems() : f.write('{0:60}'.format(key)) f.write('\n') for i in range(0,len(self.vocabulary)) : f.write('{0:15}'.format(self.vocabulary[i])) for j in range(0,23) : f.write('{}'.format(self.appears(self.doc_term_matrix[i][j])).ljust(60)) f.write('\n') f.close() def appears(self, i) : """ Author: Jason This method will return 1 if the frequency (i) is greater than 1. It is used for writing the term/document frequency matrix """ if i >= 1 : return 1 else: return 0 def clean_external_links(self, external) : """ Author: Jason This method will cremove the non links from the external links """ urls = [] for link in external : if link == None : return urls else : urls.append(link) return urls def query_engine(self, N): """ Author: Jason This method will be the main query handler. self.all_words format (var info below): [('spring'), [('url', 3), ('other_page', 4)] ] word tuples(url, frequency) """ print "#################################################################" print "################ Jason and Nicoles' Web Crawler #################" print "#################################################################" print print "Please enter a query to search the lyle.smu.edu/~fmoore domain." print "Search will display top " + str(N) + " results or all results that query is found on." 
print "Type 'quit' to exit the search engine" user_input = "" while True : user_input = raw_input("> ") if user_input == "quit" or user_input == "Quit" or user_input == "QUIT": break query = self.p.stem_word(re.sub("[^\w]", " ", user_input).split()) query = [word.lower() for word in query] for word in query : if word in self.stopwords : query.remove(word) query_vector = [self.calTFIDF(word, self.visited) for word in query] docs = {} for doc_name, ID in self.docs.iteritems() : vector = [] for word in query : if word in self.vocabulary : if self.doc_term_matrix[self.vocabulary.index(word)][self.docs[self.add_root_if_not_there(doc_name)]] >= 1 : vector.append(1) else : vector.append(0) docs[doc_name] = self.normalize_vector(vector) rankings = {} for url, doc_vec in docs.iteritems() : rankings[url] = self.calculate_cosine_similarity(doc_vec, query_vector) sorted_rankings = sorted(rankings.items(), key=operator.itemgetter(1), reverse=True) i = 0 if sorted_rankings[0][1] == 0.0 : print '%s not found in domain.\n' % user_input continue print ' Score: Document:' while i < N : if sorted_rankings[i][1] == 0.0 : break print ' {0:4f}'.format(sorted_rankings[i][1]) + ' {}'.format(sorted_rankings[i][0]) i += 1 print return def normalize_vector(self, vector) : """ Author: Jason This method will nomalize the vector to prep for calculate_cosine_similarity """ if numpy.linalg.norm(vector) == 0.0 : return [0.0 for i in vector] else : return [i / numpy.linalg.norm(vector) for i in vector] def calculate_cosine_similarity(self, doc, query) : """ Author: Jason This method will calculate the cosine similarity betwee two vectors of equal size """ if len(doc) != len(query) : return 0.0 return numpy.dot(doc,query) def crawl(self, pages_to_index) : """ Author: Jason and Nicole This is the main worker method. It will parse the urls, add the words to the index, get the next links, and continue looping through the queue until the number of pages to index is met. 
""" parser = robotparser.RobotFileParser() parser.set_url(urlparse.urljoin(_ROOT_, 'robots.txt')) parser.read() # Add _ROOT_ url to queue urlqueue = [_ROOT_] # visited, external, jpeg, and broken links visited, external, jpeg, broken = [], [], [], [] # pages indexd pages_indexed = 0 while urlqueue: # get flast element in urlqueue url = urlqueue.pop(-1) if self.clean_url(url) in visited: continue # check if we can fetch the page first if parser.can_fetch('*', urlparse.urljoin('/', url)) : # remove the / at the beginning of the string url = re.compile('^/').sub('',url) # fetch the page page = self.fetch(url) # add page to visited links visited.append(self.clean_url(url)) # get urls from page new_urls = self.extract_urls(page) for new_url in new_urls : # check if we have already visited it or are going to if new_url not in visited and new_url not in urlqueue and new_url not in jpeg and new_url not in broken and new_url not in external : if self.external_link(new_url) : external.append(new_url) elif self.jpeg_link(new_url) : jpeg.append(new_url) elif self.excel_link(new_url) : visited.append(new_url) elif self.broken_link(new_url) : broken.append(new_url) else : urlqueue.append(new_url) # docs and page id self.docs[self.add_root_if_not_there(url)] = pages_indexed # checks to see if url is parsable aka .html, .htm, .txt # if yes, then parse and index; if no, pass filename, file_extension = os.path.splitext(url) if not (file_extension == ".pdf" or file_extension == ".pptx") : pagetext = requests.get(_ROOT_ + self.clean_url(url)) pagetext = pagetext.text cleantext = self.prepare_text(pagetext) doc_words = Counter(cleantext) self.index(url, doc_words) # increment the pages indexed pages_indexed += 1 if int(pages_indexed) >= int(pages_to_index): break # end if # end while # clean the links for visual appearance visited = set(self.add_root_to_links(visited)) self.visited = visited jpeg = self.add_root_to_links(jpeg) broken = self.add_root_to_links(broken) external = self.clean_external_links(external) # write to output file self.write_output(visited, external, jpeg, broken, self.all_words_freq) # query engine with N=5 self.query_engine(5)
#!/usr/bin/env python
import sys
sys.path.append('/home/yipei/Twitter/FeatureExtraction/code/util')
from collections import defaultdict
import os.path as path
from stemmer import PorterStemmer
import re
import TwitterParser as Tparse

stemmer = PorterStemmer()
filelist = sys.argv[1]
outputdir = sys.argv[2]

for line in open(filelist):
    line = line.strip()
    fin = open(line, 'r')
    clip = path.basename(line).split(".")[0]
    print "process ", clip
    boffile = path.join(outputdir, clip + '.bof')
    fout = open(boffile, 'w')
    wordDict = defaultdict(int)
    wordDict.clear()
    # go through all tweets and count the number of each term
    while True:
def GetDataset():
    np.random.shuffle(emails)
    x_vals = []
    y_vals = []
    stemmer = PorterStemmer()
    word_mapping = GetWordDictionary(emails)
    i = 0
    text_data = []
    for i in range(0, len(emails)):
        # print "Evaluation Email %d" % (i)
        # note: time diff in mins
        email, next_email, time_diff, label = emails[i]
        # Create feature array
        features = []
        # Feature 1: Number of to
        features.append(len(email['to']))
        # Feature 2: Num words
        words = email['body'].split()
        lower_case_body = [stemmer.stem(x.lower(), 0, len(x) - 1) for x in words]
        features.append(len(words))
        # Feature 3: Number of CC
        features.append(email['cc_count'])
        # Feature 4: is reply
        if email['is_re']:
            features.append(1)
        else:
            features.append(0)
        # Feature 5: Time of Day (hour)
        date = email['date']['local_date']
        hour = date.hour
        features.append(hour)
        # Feature 6: Length of Subject Line
        subject_words = email['subject'].split()
        lower_case_subject = [stemmer.stem(x.lower(), 0, len(x) - 1) for x in subject_words]
        features.append(len(subject_words))
        # Feature 7: Day of Week
        features.append(date.weekday())
        # Feature 8: Number of question marks in Body
        features.append(email['body'].count('?'))
        # Feature 9: Number of question marks in Subject
        features.append(email['subject'].count('?'))
        # NEW FEATURES
        # Features 10-11: boolean presence of ? in body / subject
        features.append(1 if '?' in email['body'] else 0)
        features.append(1 if '?' in email['subject'] else 0)
        # Feature 12-13: "RESPONSE NEEDED"-style keywords in subject or body
        keywords = ['response', 'please', 'can', 'urgent', 'important', 'need']
        for keyword in keywords:
            stemmed_keyword = stemmer.stem(keyword, 0, len(keyword) - 1)
            features.append(1 if stemmed_keyword in lower_case_subject else 0)
            features.append(1 if stemmed_keyword in lower_case_body else 0)
        x_vals.append(features)
        y_vals.append(label)
    X = np.array(x_vals)
    Y = np.array(y_vals)
    return X, Y
import glob
import os
import re
import string

from bs4 import BeautifulSoup

from stemmer import PorterStemmer

here = lambda *x: os.path.abspath(os.path.join(os.path.dirname(__file__), *x))
data_dir = here('data', )
corpus_dir = here('corpus', )
total_files = 0
p = PorterStemmer()

if not os.path.exists(corpus_dir):
    os.makedirs(corpus_dir)

os.chdir(data_dir)
for file in glob.glob('*.sgm'):
    current_file = os.path.join(data_dir, file)
    print 'Extract files from file %s' % current_file
    soup = BeautifulSoup(open(current_file))
    for document in soup.find_all('reuters'):
        new_file = os.path.join(corpus_dir, document.get('newid'))
        with open(new_file, "wb") as extracted_file:
            read_data = document.get_text().encode('utf-8')
            clean_data = re.sub(r'/', ' / ', read_data)
            clean_data = re.sub(r'-', ' - ', clean_data)
            """ The punctuations contained in the string.punctuation are