def process_words(words):
    # Convert to lower case
    words = words.strip()
    words = words.lower()
    # Replace abbreviations with full words and do word stemming
    splited = words.split(" ")
    processed_words = []
    tpes = []
    for word in splited:
        if word not in ['[addr]', '[logo]', '[supplier]']:
            word = word.strip(" ,._+=!@%^&*:;/?<>()[]{}|'").rstrip("$")
            # '!=' instead of 'is not': identity comparison against a string literal is a bug
            if len(word) <= 1 and not word.isdigit() and word != "#":
                continue
            tpe = get_type(word)
            if word in abbrev_dict:
                processed_word = stem(abbrev_dict[word])
            elif stem(word) in abbrev_dict:
                processed_word = stem(abbrev_dict[stem(word)])
            else:
                processed_word = stem(word)
            if tpe != 'text':
                processed_word = tpe
            if processed_word not in sw:
                processed_words.append(processed_word.strip("#"))
                tpes.append(tpe)
    return (' '.join(processed_words), ','.join(tpes))
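
# Usage sketch for process_words. It depends on module-level names defined
# elsewhere in the original project (stem, get_type, abbrev_dict, sw); the
# stand-ins below are hypothetical, for illustration only.
from porter2 import stem

abbrev_dict = {'qty': 'quantity'}                          # hypothetical abbreviation map
sw = set(['the', 'a', 'an'])                               # hypothetical stopword set
get_type = lambda w: 'number' if w.isdigit() else 'text'   # hypothetical token typer

print process_words("Qty: 12 fresh apples")
# roughly: ('quantiti number fresh appl', 'text,number,text,text')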
def add_stemmered(word):
    porter_stemmered = open("porter_stemmered.txt", "w")
    for line in open(word):
        line = line.strip().split('\t')
        if line[0] != '':
            print (line[0] + '\t' + line[1] + '\t' + stem(line[1]))
            porter_stemmered.write(line[0] + '\t' + line[1] + '\t' + stem(line[1]) + '\n')
    porter_stemmered.close()
def main():
    try:
        k = int(sys.argv[2])
        fileName = sys.argv[1]
        wordsDict = createWordsDict(fileName)
        wordsDict = sorted(wordsDict.items(), key=itemgetter(1), reverse=True)[:k]
        # print '\nHi'
        for tuple in wordsDict:
            for token in tuple:
                print porter2.stem(token)
                break   # only the word itself is stemmed, not its count
    except:
        print "Error"
def stem_phrase(phrase):
    words = phrase.split()
    for i in xrange(0, len(words)):
        words[i] = porter2.stem(words[i])
    return ' '.join(words)
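
# Usage sketch for stem_phrase; assumes the porter2 module is importable.
import porter2

print stem_phrase("the cats were running")   # roughly: "the cat were run"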
def pre_process(query, stem=True):
    if stem:
        listterm = [porter2.stem(term) for term in query.split()]
    else:
        listterm = query.split()
    return listterm
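
# Usage sketch for pre_process; assumes porter2 is importable.
import porter2

print pre_process("running shoes")               # roughly ['run', 'shoe']
print pre_process("running shoes", stem=False)   # ['running', 'shoes']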
def _stemDocs(self):
    for doc in self._dWords:
        new_doc = ''
        for word in doc:
            new_doc += porter2.stem(word) + ' '
        self._doc_stem.append(new_doc)
        self._doc_stem_words.append(new_doc.split())
    print ">>> Docs stemmed"
def stem(self, word):
    try:
        if self.stemming_algo == 'porter':
            return porter2.stem(word)
        elif self.stemming_algo == 'lovins':
            return lovins.stem(word)
        else:
            return paicehusk.stem(word)
    except Exception:
        # fall back to the unstemmed word instead of silently returning None
        return word
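
# Minimal harness for the stem() method above. The carrier class is a
# hypothetical stand-in; porter2/lovins/paicehusk appear to be the modules
# of the 'stemming' package, imported elsewhere.
class _StemmerDemo(object):
    stemming_algo = 'porter'
    stem = stem   # bind the function above as a method

print _StemmerDemo().stem('running')   # roughly 'run'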
def performStemming(filteredReviewTxt):
    stemmedLst = []
    for word in filteredReviewTxt:
        # print word
        if not word:
            continue
        stemmedword = porter2.stem(word)
        stemmedLst.append(stemmedword)
    stemmedOutput = ' '.join(stemmedLst)
    return stemmedOutput
def stem(caller, word):
    global _orengostemmer
    lang = getattr(caller, "lang", "en")
    if lang == "en":
        return porter2.stem(word)
    elif lang == "pt":
        if _orengostemmer is None:
            from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
            _orengostemmer = OrengoStemmer()
        return _orengostemmer.getWordStem(word)
    else:
        return word
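
# Usage sketch for the language-aware stem(); any object with a lang attribute
# works as the caller (the stand-in class below is hypothetical; porter2 is
# assumed importable).
class _Caller(object):
    lang = "en"

print stem(_Caller(), "running")   # English goes through porter2, roughly 'run'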
def Stemming():
    filesList = glob.glob(dirPath)
    # ps = PorterStemmer()
    global listWord
    global noScanDocs, noStems
    for files in filesList:
        noScanDocs += 1
        textFile = open(files, "r")
        words = textFile.read().lower()
        plainWord = re.sub('<[^>]*>', '', words)   # strip HTML tags
        listWord = Tokenize(plainWord)
        for w in listWord:
            noStems += 1
            stemWord[porter2.stem(w)] += 1
def getFeatures(attraction, title, bodyText, labels, stopwords):
    # dictionary to hold features
    features = defaultdict()
    # NOTE: if features are commented out, they were experimented with but deemed
    # either unuseful or overfitting
    # loop through words in the attraction
    for word in attraction:
        features[('attraction_word', word)] = 1
    titleCount = 0
    # loop through words in title
    for word in title:
        # check not in stopwords
        if word.lower() not in stopwords:
            features[('title_word', word.lower())] = 1
            if word.lower() in labels:
                features[('title_label', word.lower())] = 1
            if titleCount == 0:
                features[('first_word', word)] = 1
        titleCount += 1
    counter = 0
    previousWord = ''
    # loop through words in body text
    for word in bodyText:
        # check not punctuation or stopword
        if word.lower() not in stopwords and word not in string.punctuation:
            features[('body_word', word)] = 1
            features[('body_word_stemmed', stem(word).lower())] = 1
            # unuseful features are commented out here
            # if postags[counter][1] == 'JJ':
            #     features[('body_jj', word)] = 1
            # if postags[counter][1] == 'NN':
            #     features[('body_nn', word)] = 1
            if word.lower() in labels:
                features[('body_label', word.lower())] = 1
            # if counter != 0:
            #     features[('bigram', previousWord.lower() + word.lower())] = 1
            previousWord = word
            # for key, value in gazeteer.iteritems():
            #     if word.lower() in value:
            #         features[(key + 'gazeteer', word.lower())] = 1
        counter += 1
    features[('length_review', len(bodyText))] = 1
    return features
def abstract_words(abstract):
    """Get each word in the abstract, making chars lower case, removing
    non-alphabetic chars and spaces."""
    allowable_chars = unicode("abcdefghijklmnopqrstuvwxyz ")
    trimmed = "".join([char for char in abstract.lower() if char in allowable_chars])
    words = []
    for word in trimmed.split(" "):
        if len(word) <= 1 or word in COMMON_ENGLISH:
            continue
        try:
            stemmed_word = stem(word)
        except ValueError:
            stemmed_word = word
        words.append(stemmed_word)
    return words
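
# Usage sketch for abstract_words; COMMON_ENGLISH and stem are defined elsewhere
# in the original module, so hypothetical stand-ins are used here.
from porter2 import stem
COMMON_ENGLISH = set(['the', 'of', 'and'])

print abstract_words(u"The stemming of abstracts, simplified!")
# non-alphabetic characters are dropped first; result is roughly
# ['stem', 'abstract', 'simplifi']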
def processQuery(self, doc_list):
    """
    This is the main function which calculates scores for all documents
    and ranks them in decreasing order of score.
    """
    # Tokenize query.
    # Punctuation handling: str.replace returns a new string, so the result must
    # be assigned back (the original discarded it, and its backtick branch
    # replaced the wrong character).
    if '.' in self.query:
        self.query = self.query.replace('.', ' ')
    if "'" in self.query:
        self.query = self.query.replace("'", "")
    if "`" in self.query:
        self.query = self.query.replace("`", "")
    if '"' in self.query:
        self.query = self.query.replace('"', "")
    self.query = self.query.decode("utf8")
    words = nltk.word_tokenize(self.query.lower())
    score = defaultdict(lambda: 0)
    for word in words:
        # Stem words to bring into base form
        word = porter2.stem(word)
        # Fetch the document frequency of the query word
        cursor = self.db.cursor()
        sql = "SELECT FREQ from DOC_FREQ where WORD='%s'" % word
        cursor.execute(sql)
        if cursor.rowcount > 0:
            dft = cursor.fetchone()[0]
            idf = self.calculate_IDF(dft)
        else:
            idf = 0
        for doc in doc_list:
            tf = self.get_TF(word, doc)
            tw = self.getTermWeight(tf, doc)
            score[doc] = score[doc] + idf * tw
    # We have now calculated the score of documents with respect to our query.
    result = []
    for doc, doc_score in sorted(score.iteritems(), key=lambda (k, v): (v, k)):
        result.append((doc_list[doc], doc_score))
    result.reverse()
    return self.final_result(result)
def get_matrix(file_name):
    # Initialize the dictionary of all words
    allwords = {}
    # Initialize the per-article word lists and the title list
    articlewords = []
    titles = []
    art_no = 0
    # Open the article file
    f = open(file_name)
    # Read the first line
    line = f.readline()
    # Process every line in the file, one at a time
    while line:
        # Append an empty word dict for this article (accessed later via art_no)
        articlewords.append({})
        # Split on "|" and append the first element to the title list
        titles.append(re.split('\|', line)[0])
        # Split on runs of non-word characters and keep strings of 4+ characters
        words = [s.lower() for s in re.split('\W+', line) if len(s) > 3]
        # Process each string in the words list
        for word in words:
            # Extract the stem
            word_stem = stem(word)
            # Register 'word_stem: 0' in allwords if the key is not present yet
            allwords.setdefault(word_stem, 0)
            # Increment the count for word_stem in allwords
            allwords[word_stem] = allwords[word_stem] + 1
            # Register 'word_stem: 0' in this article's dict if not present yet
            articlewords[art_no].setdefault(word_stem, 0)
            # Increment the count for word_stem in this article's dict
            articlewords[art_no][word_stem] += 1
        # Read the next line
        line = f.readline()
        art_no += 1
    f.close()   # the original was missing the parentheses, so the file was never closed
    # Keep only the words that occur more than once
    wordlist = []
    for w, c in allwords.items():
        if c > 1:
            wordlist.append(w)
    # Return the titles, the word list, and a per-article word-count matrix;
    # only words present in wordlist are used as matrix columns.
    return titles, wordlist, matrix([[(word in f and f[word] or 0) for word in wordlist]
                                     for f in articlewords])
def _sumStem(self):
    k = 1.0
    weighted_sum = 0.0
    num_docs = len(self._dWords)
    doc_len_avg = 0.0
    for doc in self._doc_stem_words:
        doc_len_avg += len(doc) - 1.0   # subtract 1: the first element is the doc id, not a term
    doc_len_avg = doc_len_avg / len(self._dWords)   # the averaging step
    # The main tf.idf loop.
    # For each query we calculate the tf.idf for each document and add this to the
    # list. This means query_weights will be len(documents) long and weighted sums
    # will be len(queries) long, so the overall size is len(queries) * len(documents).
    for query in self._qWords:
        query_weights = []
        print ">>> Processing query " + query[0]
        for doc in self._doc_stem_words:
            doc_len = len(doc) - 1.0
            weighted_sum = 0.0
            for word in query[1:]:
                word = porter2.stem(word)   # inline stemming of query words
                tf_wq = query.count(word)
                tf_wd = doc.count(word)
                df_w = 0.0
                tf_idf = 0.0
                # No point calculating the tf.idf if we know it's going to be zero
                if tf_wd != 0:
                    df_w = self._numStemDocsContain(word)   # this step takes ages :(
                    tf_idf = (tf_wq * (tf_wd / (tf_wd + ((k * doc_len) / doc_len_avg)))
                              * math.log(num_docs / df_w))
                weighted_sum += tf_idf
            # Only care about things with a weight above 0
            if weighted_sum != 0:
                query_weights.append((query[0], doc[0], str(weighted_sum)))
        self._weighted_sums.append(query_weights)
    if self._performWrite:
        self._writeOut()
        print ">>> Writing tf.idf results to tfidf.top"
def azureml_main(df_message=None, df_features=None):
    # Merge the header and the message body
    heading = df_message.Heading.iloc[0]
    if not heading:
        heading = ''
    content = df_message.Content.iloc[0]
    org_message = heading.lower() + " " + content.lower()
    feature_vec = list(df_features.iloc[0, :])

    # PREPROCESSING
    # --------------------
    # Replace various patterns
    message = org_message
    for key, pattern_list in patterns:
        for p in pattern_list:
            message = re.sub(p, key, message)
    # str.replace returns a new string; the original discarded these results
    message = message.replace('kjempe', '')   # strip Norwegian intensifier prefixes
    message = message.replace('mega', '')
    message = message.replace('super', '')
    message = re.split('\W+', message)
    message = [w for w in message if w not in words_to_ignore]
    message = [stem(w) for w in message]

    # FIND FEATURES IN MESSAGE
    # -----------------------------
    # For each message, count the features which occur in that particular message
    message_features = [0] * len(feature_vec)
    for idx, feat in enumerate(feature_vec):
        message_features[idx] = message.count(feat)

    d = {'0': message_features}
    df_output = pd.DataFrame().from_dict(d, orient='index')
    df_output.columns = df_features.columns
    # Return value must be a sequence of pandas.DataFrame
    return df_output,
def stemWords(data):
    jstuff = []
    for word in data:
        suffix = ''
        root = ''
        if word.lower() not in stopList:
            if '*' not in word:
                root = stem(word)
                print root
                # diff the stem against the original word to recover the suffix
                for i, s in enumerate(difflib.ndiff(root, word)):
                    if s[0] == ' ':
                        continue
                    # elif s[0] == '-':
                    #     print(u'Delete "{}" from position {}'.format(s[-1], i))
                    elif s[0] == '+':
                        print(u'Add "{}" to position {}'.format(s[-1], i))
                        suffix += s[-1]
                jstuff.append({'word': root, 'variance': suffix})
    return jstuff
def word_stem_stop_word(reply_text, num):
    stopwordsfile = open('stopwords.txt')
    stopwords = stopwordsfile.read().split('\r\n')
    nltk_word = nltk.word_tokenize(reply_text)
    nltk_word = nltk.pos_tag(nltk_word)
    reply = []
    proper_nouns = Set([])   # Set comes from the Python 2 'sets' module
    for word, tag in nltk_word:
        if str(word.lower()) not in stopwords:
            if tag == 'NNP' or tag == 'NNPS':
                proper_nouns.add(word.lower())
            else:
                word = correct.correct(word)
                word = porter.stem(word)
                word = correct.correct(word)
                reply.append(word.lower())
    # print reply
    # print proper_nouns
    reply = ' '.join(reply)
    if num:
        return reply, proper_nouns
    else:
        return reply
def applyStemming(stopWordsRemovedAbstract):
    stemmingAppliedAbstract = []
    for word in stopWordsRemovedAbstract:
        stemmingAppliedAbstract.append(stem(word))
    return stemmingAppliedAbstract
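
# Usage sketch; stem is expected to be porter2.stem (import shown for completeness).
from porter2 import stem

print applyStemming(['stemming', 'applied', 'abstracts'])
# roughly ['stem', 'appli', 'abstract']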
def thirty():
    with open("./medline.txt.send.tok") as f:
        for token in f:
            print(porter2.stem(token.strip()))
def stem(words):
    return {porter2.stem(word) for word in words}
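
# Usage sketch: stemming a collection into a set de-duplicates inflected forms.
import porter2

print stem(['run', 'runs', 'running'])   # set(['run'])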
text2_tokens = wgetAndTokenize(text2_url)

# make RDDs of (word, position-in-original-text) pairs (so we can find context later)
text1_tokensRDD = sc.parallelize(text1_tokens).zipWithIndex()
text2_tokensRDD = sc.parallelize(text2_tokens).zipWithIndex()
# print text1_tokensRDD.take(5)

# get rid of sequences of non-word chars, keep remaining non-empty strings not in the stop list:
text1_tokensRDD = text1_tokensRDD.map(lambda p: (re.sub('\W+', '', p[0]).lower(), p[1])) \
                                 .filter(lambda p: len(p[0]) > 0 and p[0] not in stop_words)
print text1_tokensRDD.take(5)
text2_tokensRDD = text2_tokensRDD.map(lambda p: (re.sub('\W+', '', p[0]).lower(), p[1])) \
                                 .filter(lambda p: len(p[0]) > 0 and p[0] not in stop_words)

# stem the words using the imported stem function (chosen arbitrarily)
text1_stemmedRDD = text1_tokensRDD.map(lambda p: (stem(p[0]), p[1]))
print text1_stemmedRDD.take(5)
text2_stemmedRDD = text2_tokensRDD.map(lambda p: (stem(p[0]), p[1]))

t1raw = text1_stemmedRDD.toDF(['entry', 'locus'])
t1raw.show()
t2raw = text2_stemmedRDD.toDF(['entry', 'locus'])
t1raw.registerTempTable("t1raw")
t2raw.registerTempTable("t2raw")

bg1 = sqlContext.sql(
    "select a.entry a1, b.entry b1, a.locus, b.locus from t1raw a cross join t1raw b "
    "where a.entry < b.entry and a.locus - b.locus < 7 and b.locus - a.locus < 7")
bg1.show(4)
def stemming(self, index, text, data):
    from porter2 import stem
    # Bucket the message length into a pseudo-word feature
    if len(text) <= 129:
        text += ' twentyfivesentence'
    elif len(text) <= 181:
        text += ' fiftysentence'
    elif len(text) <= 243:
        text += ' seventyfivesentence'
    else:
        text += ' largesentence'
    # Turn punctuation and digit counts into repeated pseudo-words
    for i in xrange(text.count('%')):
        text += ' uniqpercent'
    for i in xrange(text.count('@')):
        text += ' uniqatmark'
    for i in xrange(text.count(',')):
        text += ' uniqcomma'
    for i in xrange(text.count("'")):
        text += ' uniqapostrophe'
    for i in xrange(text.count('...')):
        text += ' uniqellipses'
    for i in xrange(text.count(':')):
        text += ' uniqcolon'
    for i in xrange(text.count('!')):
        text += ' uniqexclamation'
    # The original tested "if '(' or ')' in text", which is always true;
    # this is the intended membership test:
    if '(' in text or ')' in text:
        text += ' uniqparentheses'
    for i in xrange(text.count('?')):
        text += ' uniqquestion'
    for i in xrange(text.count('"')):
        text += ' uniqquote'
    for i in xrange(text.count('#')):
        text += ' uniqhashtag'
    for i in xrange(text.count('0')):
        text += ' uniqzero'
    for i in xrange(text.count('1')):
        text += ' one'
    for i in xrange(text.count('2')):
        text += ' two'
    for i in xrange(text.count('3')):
        text += ' three'
    for i in xrange(text.count('4')):
        text += ' four'
    for i in xrange(text.count('5')):
        text += ' five'
    for i in xrange(text.count('6')):
        text += ' six'
    for i in xrange(text.count('7')):
        text += ' seven'
    for i in xrange(text.count('8')):
        text += ' eight'
    for i in xrange(text.count('9')):
        text += ' nine'
    if '/' in text:
        text = text.replace('/', ' ')
        text += ' forwardslash'
    upper = sum(1 for i in text if i.isupper())
    iterate = 1
    while upper - iterate > 0:
        text += ' formalword'
        iterate += 1
    import wikipedia
    import re
    if index % 100 == 0:
        p = 100 * index / len(data)
        print "Wiki part, percent: %d" % p
    searchForEpi = re.sub(r'([A-Z])', r' \1', data[index]['trope'])
    searchForTitle = re.sub(r'([A-Z])', r' \1', data[index]['page'])
    search = searchForEpi + ' ' + searchForTitle
    try:
        summary = wikipedia.summary(search, sentences=1)
        newtext = str(text) + ' ' + str(summary)   # note: newtext is never used below
    except:
        summary = None
    word_list = text.split()
    for i in xrange(len(word_list)):
        word_list[i] = stem(word_list[i])
    join_list = ' '.join(word_list)
    join_list += ' ' + data[index]['page']
    return join_list
def getTextFeatures2(dirName):
    engStopWords = stopwords.words('english')
    mystopwords = [
        "a", "about", "above", "above", "across", "after", "afterwards", "again", "against",
        "all", "almost", "alone", "along", "already", "also", "although", "always", "am",
        "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow",
        "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be",
        "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand",
        "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both",
        "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could",
        "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during",
        "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough",
        "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except",
        "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former",
        "formerly", "forty", "found", "four", "from", "front", "full", "further", "get",
        "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here",
        "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself",
        "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest",
        "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least",
        "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine",
        "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name",
        "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none",
        "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
        "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours",
        "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put",
        "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious",
        "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so",
        "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere",
        "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them",
        "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore",
        "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those",
        "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too",
        "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up",
        "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
        "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein",
        "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever",
        "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet",
        "you", "your", "yours", "yourself", "yourselves", "the"]
    mergedStopWords = list(set(engStopWords + mystopwords))
    print len(engStopWords), len(mystopwords), len(mergedStopWords)
    # initializations:
    ignoreCase = True
    iDoc = 0
    tf = []
    idf = []
    tfidf = []
    nWords = []
    words = []
    allFiles = []
    for subdir, dirs, files in os.walk(dirName):
        files.sort()
        for file in files:   # for each file in the given directory:
            file = dirName + file
            # update the list of files analyzed:
            allFiles.append(file)
            curWords = []
            # print repr(iDoc) + " of " + repr(len(files)) + file
            nWords.append(0)   # add count of total words in the current document
            for line in open(file):   # for each line in the current document:
                if ignoreCase:
                    line = line.lower()
                tokenizer = RegexpTokenizer('[\d\.]+|\w+|\$[\d\.]+')   # initialize tokenizer
                tokens = tokenizer.tokenize(line)
                for word in tokens:   # for each word:
                    # keep words of length >= 3 that are not numbers or stop words
                    if len(word) > 2 and not word.isdigit() and word not in mergedStopWords:
                        # word = WordNetLemmatizer().lemmatize(word, 'v')   # stemming
                        # word = PorterStemmer().stem_word(word)
                        word = stem(word)
                        if word.isdigit():
                            continue
                        nWords[iDoc] += 1   # update the number of total words in the document
                        curWords.append(word)
                        if word not in words:
                            # the current word is not in the GLOBAL bag of words:
                            tf.append([0] * len(files))      # add zeros to the tf matrix
                            tfidf.append([0] * len(files))   # add zeros to the tf-idf matrix
                            tf[-1][iDoc] += 1                # tf of the new word in this doc
                            words.append(word)               # add the word to the bag of words
                            # print len(words)
                            idf.append(0)                    # add a zero to the idf array
                        else:
                            idxWord = words.index(word)      # index in the bag of words
                            tf[idxWord][iDoc] += 1           # update the term frequency matrix
            nGrams2 = ngrams(curWords, 2)
            for ngram, count in nGrams2.iteritems():
                if count > 1:
                    if ngram not in words:
                        # the current ngram is not in the GLOBAL bag of words:
                        tf.append([0] * len(files))
                        tfidf.append([0] * len(files))
                        tf[-1][iDoc] += count   # add the ngram count to the tf matrix
                        words.append(ngram)     # add the ngram to the bag of words
                        # print "NGRAM: " + str(len(words))
                        idf.append(0)
                    else:
                        idxWord = words.index(ngram)
                        tf[idxWord][iDoc] += 1
            iDoc = iDoc + 1   # current number of processed documents
            # CLEAR INFREQUENT VALUES:
            # if iDoc % 10 == 0:
            #     print "DOC " + str(iDoc) + " of " + str(len(files)) + " - - - WORDS " + str(len(words))
            # BUG!!!!!!!!!!!!!!!!!!!!
            if (iDoc % 500 == 0) | (iDoc == len(files)):
                toRemove = []
                print " CLEANING: DOC " + str(iDoc) + " of " + str(len(files)) + \
                      " - - - - - Words before cleaning", len(words),
                for w in range(len(words)):
                    countDocs = sum(x > 0 for x in tf[w])
                    if countDocs < 3:
                        toRemove.append(w)
                for rindex in sorted(toRemove, reverse=True):
                    del tf[rindex]
                    del tfidf[rindex]
                    del idf[rindex]
                    del words[rindex]
                print " Words after cleaning", len(words)
    numOfDocs = float(iDoc)   # total number of processed docs
    # post process: compute the final tf array and the idf counter:
    for i in range(len(tf)):           # for each word
        for j in range(len(tf[i])):    # for each document
            if tf[i][j] > 0:           # the word appeared at least once in this document:
                idf[i] += 1.0          # update the idf counter
                if nWords[j] > 0:
                    tf[i][j] = tf[i][j] / float(nWords[j])   # normalize the tf value
                else:
                    tf[i][j] = 0.0
    # drop words that appear in fewer than T1 documents; the original popped
    # inside a forward range() loop, which skips elements, so iterate in reverse:
    T1 = 1.0
    for i in reversed(range(len(idf))):
        if idf[i] < T1:
            idf.pop(i)
            tf.pop(i)
            words.pop(i)
            tfidf.pop(i)
    dFreq = []
    # compute the final idf value
    for i in range(len(idf)):
        dFreq.append(idf[i] / numOfDocs)
        idf[i] = 1.0 + math.log10(numOfDocs / idf[i])
    # compute the tf-idf value:
    for i in range(len(tf)):           # for each word
        for j in range(len(tf[i])):    # for each document
            tfidf[i][j] = idf[i] * tf[i][j]
    return (allFiles, words, tf, idf, tfidf, dFreq)
def my_stem(word):
    if word == word.upper() or (len(word) >= 2 and word[:2] == "__"):
        return word
    else:
        return stem(word.lower())
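
# Usage sketch: all-caps tokens and dunder-prefixed placeholders pass through
# unstemmed; everything else is lower-cased and stemmed (stem is porter2.stem).
from porter2 import stem

print my_stem('NASA')        # 'NASA' (all caps, left alone)
print my_stem('__mask__')    # '__mask__' (placeholder, left alone)
print my_stem('Running')     # roughly 'run'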
def check(self, candidate):
    if len(candidate) == 0:
        return False
    normed = stem(candidate.lower())
    return normed in self.stems
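
# Minimal harness for check(); the carrier class is a hypothetical stand-in
# whose stems attribute holds an already-stemmed vocabulary.
from porter2 import stem

class _Checker(object):
    def __init__(self, words):
        self.stems = set(stem(w.lower()) for w in words)
    check = check   # bind the method above

c = _Checker(['running', 'jumped'])
print c.check('Runs')   # True: both normalize to 'run'
print c.check('')       # False: empty candidate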
def BOW(db, corpus, cwd):
    """
    This function creates the bag-of-words representation for all the corpus
    documents and creates a database for all words.
    """
    TOTAL_WORDS = 0
    doc_freq = dict()
    DOC_LIST = dict()
    doc_dic = dict()
    no_of_doc = 0
    # Create the DOCF table
    db.create_table_docf()
    # Execute shell script which gets the document list from the corpus folder
    command = "sh getdoclist.sh " + corpus + " " + cwd
    ret = os.system(command)
    if ret != 0:
        print "Error creating document list."
        exit(1)
    stop = string.punctuation   # punctuation to be removed
    # Open file containing the names of documents in the corpus
    fin = open("doclist", "r")
    # Open one document at a time and construct a bag of words for it
    for line in fin:
        openfile = corpus + "/" + line.strip()
        fdoc = open(openfile, "r")
        no_of_doc += 1
        # Read the opened doc line by line
        for sentence in fdoc:
            # Convert into lower case and tokenize using nltk.
            # Punctuation handling: str.replace returns a new string, so the
            # result must be assigned back (the original discarded it, and its
            # backtick branch replaced the wrong character).
            if '.' in sentence:
                sentence = sentence.replace('.', ' ')
            if "'" in sentence:
                sentence = sentence.replace("'", "")
            if "`" in sentence:
                sentence = sentence.replace("`", "")
            if '"' in sentence:
                sentence = sentence.replace('"', "")
            sentence = sentence.decode("utf8")
            sentence = nltk.word_tokenize(sentence.lower())
            # if a word from the sentence list is not in stop, add it to the doc's words
            for word in sentence:
                if word not in stop:
                    # Stemming: Porter
                    word = porter2.stem(word)
                    doc_dic[word] = doc_dic.get(word, 0) + 1
                    TOTAL_WORDS = TOTAL_WORDS + 1
        # Create a table for the doc: split 'doc_name.txt' and name the table docname
        docname = "d" + line.split('.')[0]
        DOC_LIST[docname] = line.strip()
        db.create_table_doc(docname)
        # Add the doc's words to the doc db; also update doc freq for the words
        for word, count in doc_dic.items():
            doc_freq[word] = doc_freq.get(word, 0) + 1
            db.insert_into_doc(docname, word, count)
        doc_dic.clear()
    # Close the doclist file
    fin.close()
    # All documents are processed and their tables made; doc_freq now maps each
    # word to the number of documents in which it occurs.
    # Set the number of corpus documents
    db.set_no_of_doc(no_of_doc)
    db.set_total_words(TOTAL_WORDS)
    # Add doc_freq to the doc_freq table
    for word, doc_count in doc_freq.items():
        db.add_to_doc_freq(word, doc_count)
    return DOC_LIST
def stem_leaf(self):
    for leaf in self.leaves:
        leaf.stem = stem(leaf.value)
def __stem(self, tok):
    # strip trailing punctuation; the original character class also contained
    # literal '|' separators, which (unintentionally) matched pipe characters
    tok = re.sub(r"[.,:?\"';!]+$", "", tok)
    tok = porter2.stem(tok)
    return tok
def stemWords(word):
    return porter2.stem(word).lower()
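
# Design note: the function above stems first and lower-cases after, so mixed-
# case input may slip past porter2's lower-case suffix rules. Lower-casing
# before stemming would be the safer order (alternative sketch, not the original):
def stemWordsLowerFirst(word):
    return porter2.stem(word.lower())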
answer = answer.lower()
word_list = []
# remove all punctuation; 'word' here is a word-matching regex compiled elsewhere
for m in word.finditer(answer):
    # we are on word w
    w = m.group(0)
    # skip stopwords
    if w in STOPWORDS:
        continue
    # skip two letter words
    if len(w) < 3:
        continue
    if dostem:
        w_stem = stem(w)
    else:
        w_stem = w
    word_list.append(w_stem)
answer2 = " ".join(word_list)
low_file.write(answer2 + "\n")
ids_file.write(user_id + "," + question_id + "\n")
counter += 1
print "processed ", counter, " entries"
def getTextFeatures(dirName):
    engStopWords = stopwords.words('english')
    # initializations:
    ignoreCase = True
    iDoc = 0
    tf = []
    idf = []
    tfidf = []
    nWords = []
    words = []
    allFiles = []
    for subdir, dirs, files in os.walk(dirName):
        files.sort()
        for file in files:   # for each file in the given directory:
            file = dirName + file
            # update the list of files analyzed:
            allFiles.append(file)
            # print repr(iDoc) + " of " + repr(len(files)) + file
            nWords.append(0)   # add count of total words in the current document
            for line in open(file):   # for each line in the current document:
                if ignoreCase:
                    line = line.lower()
                tokenizer = RegexpTokenizer('[\d\.]+|\w+|\$[\d\.]+')   # initialize tokenizer
                tokens = tokenizer.tokenize(line)
                for word in tokens:   # for each word:
                    # keep words of length >= 3 that are not stop words
                    if len(word) > 2 and word not in engStopWords:
                        # stemmer = SnowballStemmer("german")
                        # TODO: other languages (language detection);
                        # use SnowballStemmer.languages to see the list of languages
                        # word = stemmer.stem(word)
                        # word = PorterStemmer().stem_word(word)
                        word = stem(word)
                        # word = WordNetLemmatizer().lemmatize(word, 'v')   # stemming
                        nWords[iDoc] += 1   # update the number of total words in the document
                        if word not in words:
                            # the current word is not in the GLOBAL bag of words:
                            tf.append([0] * len(files))      # add zeros to the tf matrix
                            tfidf.append([0] * len(files))   # add zeros to the tf-idf matrix
                            tf[-1][iDoc] += 1                # tf of the new word in this doc
                            words.append(word)               # add the word to the bag of words
                            idf.append(0)                    # add a zero to the idf array
                        else:
                            idxWord = words.index(word)      # index in the bag of words
                            tf[idxWord][iDoc] += 1           # update the term frequency matrix
            iDoc = iDoc + 1   # current number of processed documents
    numOfDocs = float(iDoc)   # total number of processed docs
    # post process: compute the final tf array and the idf counter:
    for i in range(len(tf)):           # for each word
        for j in range(len(tf[i])):    # for each document
            if tf[i][j] > 0:           # the current word appeared at least once in this document:
                idf[i] += 1.0          # update the idf counter
                if nWords[j] > 0:
                    tf[i][j] = tf[i][j] / float(nWords[j])   # normalize the tf value
                else:
                    tf[i][j] = 0.0
    # keep only words whose document frequency exceeds T1
    T1 = 1.0
    idfTemp = []
    tfTemp = []
    wordsTemp = []
    tfidfTemp = []
    for i in range(len(idf)):
        if idf[i] > T1:
            idfTemp.append(idf[i])
            tfTemp.append(tf[i])
            wordsTemp.append(words[i])
            tfidfTemp.append(tfidf[i])
    idf = list(idfTemp)
    tf = list(tfTemp)
    words = list(wordsTemp)
    tfidf = list(tfidfTemp)
    dFreq = []
    # compute the final idf value
    for i in range(len(idf)):
        dFreq.append(idf[i] / numOfDocs)
        idf[i] = 1.0 + math.log10(numOfDocs / idf[i])
    # compute the tf-idf value:
    for i in range(len(tf)):           # for each word
        for j in range(len(tf[i])):    # for each document
            tfidf[i][j] = idf[i] * tf[i][j]
    return (allFiles, words, tf, idf, tfidf, dFreq)
# (fragment: an enclosing try block and a loop over author elements begin above this excerpt)
        surname, givenname = '', ''
        for data in child.iter('surname'):
            surname = data.text
        for data in child.iter('given-names'):
            givenname = data.text
        authorToNumber[string.join([givenname, surname], ' ')].append(xmlNumber)
    for child in itertools.chain(xmldata.getroot().iter('p'),
                                 xmldata.getroot().iter('title')):
        # does the text
        if child.text is not None:
            lowerText = string.lower(child.text).encode('ascii', 'ignore')
            mangledText = re.sub(r'\W+', ' ', lowerText)   # removes non-alphabet chars
            words = string.split(mangledText)
            for word in words:
                stemmed = porter2.stem(word)   # stemming removes -ing, -ed, etc.
                if not isNumber(stemmed):      # if it converts to a float, don't keep it
                    numberToWords.update([stemmed])
            numberToWordCount[xmlNumber] += len(words)
except xml.etree.ElementTree.ParseError:   # article not valid xml
    pass
outNTW.write(str(xmlNumber) + ' ' + str(len(numberToWords)) + ' ')
for oneWord in numberToWords:
    outNTW.write(oneWord + ' ')
outNTW.write('\n')
outNTW.close()
# just write these to plaintext files. janky, but it lets others use the data
# easily without having to unpack pickled files or whatever other solution
# i could choose from.
outNTT = open('number.to.type.txt', 'w')
for outdata in numberToType.iteritems():
def getTextFeatures2_notfidf(dirName):
    engStopWords = stopwords.words('english')
    mystopwords = [
        "a", "about", "above", "above", "across", "after", "afterwards", "again", "against",
        "all", "almost", "alone", "along", "already", "also", "although", "always", "am",
        "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow",
        "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be",
        "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand",
        "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both",
        "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could",
        "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during",
        "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough",
        "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except",
        "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former",
        "formerly", "forty", "found", "four", "from", "front", "full", "further", "get",
        "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here",
        "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself",
        "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest",
        "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least",
        "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine",
        "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name",
        "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none",
        "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
        "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours",
        "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put",
        "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious",
        "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so",
        "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere",
        "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them",
        "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore",
        "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those",
        "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too",
        "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up",
        "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
        "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein",
        "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever",
        "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet",
        "you", "your", "yours", "yourself", "yourselves", "the"]
    mergedStopWords = list(set(engStopWords + mystopwords))
    # initializations:
    ignoreCase = True
    iDoc = 0
    nWords = []
    words = []
    allFiles = []
    dFreq = []
    for subdir, dirs, files in os.walk(dirName):
        files.sort()
        for file in files:   # for each file in the given directory:
            # if len(allFiles) > 100:
            #     break
            file = dirName + file
            # update the list of files analyzed:
            allFiles.append(file)
            curWords = []
            # print repr(iDoc) + " of " + repr(len(files)) + file
            nWords.append(0)   # add count of total words in the current document
            statinfo = os.stat(file)
            Size = statinfo.st_size
            # debugging leftover kept from the original: only files of 500-550 bytes pass
            if Size < 500:
                continue
            if Size > 550:
                continue
            for line in open(file):   # for each line in the current document:
                if ignoreCase:
                    line = line.lower()
                if Size < 500:
                    print Size   # unreachable after the size filters above
                if Size > 3000:
                    print Size
                tokenizer = RegexpTokenizer('[\d\.]+|\w+|\$[\d\.]+')   # initialize tokenizer
                tokens = tokenizer.tokenize(line)
                for word in tokens:   # for each word:
                    # keep words of length >= 3 that are not numbers or stop words
                    if len(word) > 2 and not word.isdigit() and word not in mergedStopWords:
                        # word = WordNetLemmatizer().lemmatize(word, 'v')   # stemming
                        # word = PorterStemmer().stem_word(word)
                        word = stem(word)
                        if word.isdigit():
                            continue
                        nWords[iDoc] += 1   # update the number of total words in the document
                        if word not in words:
                            # the current word is not in the GLOBAL bag of words:
                            words.append(word)   # add the word to the bag of words
                            dFreq.append(1.0)    # add 1 to the dFreq array
                        else:
                            if word not in curWords:
                                # first occurrence of the word in the current doc
                                idxWord = words.index(word)   # index in the bag of words
                                dFreq[idxWord] += 1.0         # update the dFreq array
                        curWords.append(word)
            nGrams2 = ngrams(curWords, 2)
            for ngram, count in nGrams2.iteritems():
                if count > 5:
                    if ngram not in words:
                        # the current ngram is not in the GLOBAL bag of words:
                        words.append(ngram)
                        # print "NGRAM: " + str(len(words))
                        dFreq.append(1.0)
                    else:
                        idxWord = words.index(ngram)
                        dFreq[idxWord] += 1.0
            iDoc = iDoc + 1   # current number of processed documents
    # print words, dFreq
    # raw_input("Press ENTER to exit")
    numOfDocs = float(iDoc)   # total number of processed docs
    # keep only words that occur in more than one document
    dFreq2 = []
    words2 = []
    for i, d in enumerate(dFreq):
        if d > 1:
            dFreq2.append(d)
            words2.append(words[i])
    dFreq = dFreq2
    words = words2
    # compute the final df value
    for i in range(len(dFreq)):
        dFreq[i] = dFreq[i] / numOfDocs
    print len(dFreq)
    return (allFiles, words, dFreq)
# make RDDs of (word, position-in-original-text) pairs (so we can find context later)
text1_tokensRDD = sc.parallelize(text1_tokens).zipWithIndex()
text2_tokensRDD = sc.parallelize(text2_tokens).zipWithIndex()
# print text1_tokensRDD.take(5)

# define a list of stop words (chosen fairly arbitrarily)
stop_words = [
    'a', 'i', 'an', 'as', 'able', 'about', 'above', 'according', 'accordingly', 'across',
    'actually', 'after', 'afterwards', 'again', 'against', 'aint', 'all', 'allow', 'allows',
    'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among',
    'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything',
    'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are',
    'arent', 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available',
    'away', 'awfully', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been',
    'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides',
    'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'cmon', 'cs',
    'came', 'can', 'cant', 'cannot', 'cant', 'cause', 'causes', 'certain', 'certainly',
    'changes', 'clearly', 'co', 'com', 'come', 'comes', 'concerning', 'consequently',
    'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding',
    'could', 'couldnt', 'course', 'currently', 'definitely', 'described', 'despite', 'did',
    'didnt', 'different', 'do', 'does', 'doesnt', 'doing', 'dont', 'done', 'down',
    'downwards', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'else', 'elsewhere',
    'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody',
    'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far',
    'few', 'fifth', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former',
    'formerly', 'forth', 'four', 'from', 'further', 'furthermore', 'get', 'gets', 'getting',
    'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had',
    'hadnt', 'happens', 'hardly', 'has', 'hasnt', 'have', 'havent', 'having', 'he', 'hes',
    'hello', 'help', 'hence', 'her', 'here', 'heres', 'hereafter', 'hereby', 'herein',
    'hereupon', 'hers', 'herself', 'hi', 'him', 'himself', 'his', 'hither', 'hopefully',
    'how', 'howbeit', 'however', 'id', 'ill', 'im', 'ive', 'ie', 'if', 'ignored',
    'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates',
    'inner', 'insofar', 'instead', 'into', 'inward', 'is', 'isnt', 'it', 'itd', 'itll',
    'its', 'its', 'itself', 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows',
    'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', 'lets',
    'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'mainly',
    'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'more',
    'moreover', 'most', 'mostly', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd',
    'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless',
    'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not',
    'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok',
    'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others',
    'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall',
    'own', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus',
    'possible', 'presumably', 'probably', 'provides', 'que', 'quite', 'qv', 'rather', 'rd',
    're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively',
    'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second',
    'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self',
    'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall',
    'she', 'should', 'shouldnt', 'since', 'six', 'so', 'some', 'somebody', 'somehow',
    'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon',
    'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure',
    'ts', 'take', 'taken', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx',
    'that', 'thats', 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then',
    'thence', 'there', 'theres', 'thereafter', 'thereby', 'therefore', 'therein', 'theres',
    'thereupon', 'these', 'they', 'theyd', 'theyll', 'theyre', 'theyve', 'think', 'third',
    'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout',
    'thru', 'thus', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries',
    'truly', 'try', 'trying', 'twice', 'two', 'un', 'under', 'unfortunately', 'unless',
    'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses',
    'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants',
    'was', 'wasnt', 'way', 'we', 'wed', 'well', 'were', 'weve', 'welcome', 'well', 'went',
    'were', 'werent', 'what', 'whats', 'whatever', 'when', 'whence', 'whenever', 'where',
    'wheres', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever',
    'whether', 'which', 'while', 'whither', 'who', 'whos', 'whoever', 'whole', 'whom',
    'whose', 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wont',
    'wonder', 'would', 'wouldnt', 'yes', 'yet', 'you', 'youd', 'youll', 'youre', 'youve',
    'your', 'yours', 'yourself', 'yourselves', 'zero']

# get rid of sequences of non-word chars, keep remaining non-empty strings not in the stop list:
text1_tokensRDD = text1_tokensRDD.map(lambda p: (re.sub('\W+', '', p[0]).lower(), p[1])) \
                                 .filter(lambda p: len(p[0]) > 0 and p[0] not in stop_words)
# print text1_tokensRDD.take(5)
text2_tokensRDD = text2_tokensRDD.map(lambda p: (re.sub('\W+', '', p[0]).lower(), p[1])) \
                                 .filter(lambda p: len(p[0]) > 0 and p[0] not in stop_words)

# stem the words using the imported stem function (chosen arbitrarily)
text1_stemmedRDD = text1_tokensRDD.map(lambda p: (stem(p[0]), p[1]))
# print text1_stemmedRDD.take(5)
text2_stemmedRDD = text2_tokensRDD.map(lambda p: (stem(p[0]), p[1]))

# for each word, get the list of loci:
text1_concRDD = text1_stemmedRDD.groupByKey()
# print text1_concRDD.take(5)
text2_concRDD = text2_stemmedRDD.groupByKey()

# find every pair of words (brute force)
text1_bigrams = text1_concRDD.cartesian(text1_concRDD)
# print text1_bigrams.first()
text2_bigrams = text2_concRDD.cartesian(text2_concRDD)

# eliminate transposed pairs and dupes -- keep ("a","b"); not ("b","a") or ("a","a") etc.
text1_bigrams = text1_bigrams.filter(lambda p: p[0][0] < p[1][0])
def classifyFile(fileName, dictionaries, dictionariesWeight, dictionariesNames,
                 numOfResultsReturned, PLOT_FIGURE):
    engStopWords = stopwords.words('english')
    ignoreCase = True
    nClasses = len(dictionaries)
    Ps = [0.0] * nClasses
    nWords = 0
    curWords = []
    curFreqs = []
    totalWords = 0
    tokenizer = RegexpTokenizer('[\d\.]+|\w+|\$[\d\.]+')   # initialize tokenizer
    # STEP A: GENERATE LIST OF WORDS (AFTER STEMMING AND STOPWORD REMOVAL):
    for line in open(fileName):   # for each line in the current document:
        if ignoreCase:
            line = line.lower()
        tokens = tokenizer.tokenize(line)
        for word in tokens:   # for each word:
            # keep words of length >= 3 that are not stop words
            if len(word) > 2 and word not in engStopWords:
                # word = WordNetLemmatizer().lemmatize(word, 'v')   # stemming
                # word = PorterStemmer().stem_word(word)
                word = stem(word)
                totalWords += 1.0
                if word not in curWords:
                    curWords.append(word)
                    curFreqs.append(1.0)
                else:
                    curFreqs[curWords.index(word)] += 1.0
    normalizeFactor = (totalWords / 15.0)
    # STEP B: PROBABILITY PRODUCT COMPUTATION (BASED ON SINGLE WORDS)
    for iword, word in enumerate(curWords):
        FOUND_word = 0
        for d in range(len(dictionaries)):
            dic = dictionaries[d]
            if word in dic:
                idxWord = dic.index(word)
                toMulti = 1.0 + dictionariesWeight[d][idxWord]
                # Ps[d] *= (toMulti + (1.0 - (1.0/nClasses))) ** (curFreqs[iword]/normalizeFactor)
                # if toMulti > 20:
                Ps[d] += math.log(toMulti)
                FOUND_word = 1
            else:
                # Ps[d] *= (1.0 - (1.0/nClasses)) ** (1.0/normalizeFactor)
                Ps[d] += 0.0
        if FOUND_word == 1:
            # print word
            nWords += 1
    print Ps
    # STEP C: PROBABILITY PRODUCT COMPUTATION (BASED ON N-GRAMS):
    nGrams2 = ngrams(curWords, 2)
    for ngram, count in nGrams2.iteritems():
        FOUND_word = 0
        for d in range(len(dictionaries)):
            dic = dictionaries[d]
            if ngram in dic:
                idxWord = dic.index(ngram)
                toMulti = 1.0 + dictionariesWeight[d][idxWord]
                Ps[d] += math.log(toMulti)
                # Ps[d] *= (toMulti + (1.0 - (1.0/nClasses))) ** (1.0/normalizeFactor)
                # print ngram, toMulti
                FOUND_word = 1
            else:
                # Ps[d] *= (1.0 - (1.0/nClasses)) ** (1.0/normalizeFactor)
                Ps[d] += 0.0
        if FOUND_word == 1:
            # print ngram
            nWords += 1
    print Ps
    for d in range(nClasses):
        if nWords > 0:
            Ps[d] /= len(curWords)
            # Ps[d] /= (len(dictionariesWeight[d]) + 0.00000000001)
            # Ps[d] /= nWords
            # Ps[d] /= sum(dictionariesWeight[d])
        else:
            Ps[d] = 0
    MEANPs = mean(Ps)
    MAX = max(Ps)
    finalLabels = []
    finalLabelsPs = []
    IndecesSorted = [i[0] for i in sorted(enumerate(Ps), key=lambda x: x[1], reverse=True)]
    for i in range(numOfResultsReturned):
        finalLabels.append(dictionariesNames[IndecesSorted[i]])
        # finalLabelsPs.append(Ps[IndecesSorted[i]] / MAX)
        finalLabelsPs.append(Ps[IndecesSorted[i]])
    # for i in range(len(Ps)):
    #     if Ps[i] > 2.0 * MEANPs:
    #         # print(dictionariesNames[i] + "\t\t\t\t" + str(Ps[i]))
    #         finalLabels.append(dictionariesNames[i])
    #         finalLabelsPs.append(Ps[i])
    if PLOT_FIGURE == 1:
        fig = plt.figure()
        plt.bar(arange(1, numOfResultsReturned + 1) - 0.5, array(finalLabelsPs))
        for i in range(numOfResultsReturned):
            plt.text(i + 1, 0, finalLabels[i], rotation=90, size=10,
                     horizontalalignment='center', verticalalignment='bottom')
        plt.xticks(range(numOfResultsReturned), [], size='small')
        plt.show()
    # print nClasses
    # plt.savefig('new.png', dpi=500)
    return (finalLabels, finalLabelsPs)
def clean_word(w):
    # strip common punctuation from the stemmed, lower-cased word
    return re.sub(r'[,.;:?!\[\]{}/\\]', '', stem(w.lower()))
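
# Usage note for clean_word: punctuation is removed after stemming, so attached
# punctuation can block the suffix rules first.
import re
from porter2 import stem

print clean_word('Testing')    # roughly 'test'
print clean_word('Testing,')   # roughly 'testing' (the comma defeats the stemmer)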
def processWikiText(ipfile, file):
    f = open(ipfile, "r")
    pagetrie = dict()
    lines = f.readlines()
    f.close()
    lenlines = len(lines)
    i = 0
    count = 0
    title = ''
    docId = ''
    text = ''
    infoflag = 0
    catflag = 0
    txtflag = 0
    outflag = 0
    tflag = 0
    start = time.time()
    # pagetrie maps stem -> [title_df, infobox_df, category_df, outlink_df, text_df,
    #                        title_tf, infobox_tf, category_tf, outlink_tf, text_tf],
    # where the last five entries are {docId: count} dicts
    while i < lenlines:
        line = lines[i].lstrip(" ").rstrip("\n")
        if line == "<page>":
            i += 1
            line = lines[i].lstrip(" ").rstrip("\n")
            if line[:7] == "<title>":
                title = line[7:-8]
                i += 2
                line = lines[i].lstrip(" ").rstrip("\n")
                if line[:4] == "<id>":
                    docId = line[4:-5]
                    docId = base.decimaltobase62(int(docId))
                # index the title words
                term = ''
                tlength = len(title)
                for j in xrange(tlength):
                    lower = title[j].lower()
                    if lower < 'a' or lower > 'z':
                        if term:
                            if not stop_list.has_key(term):
                                stem = porter2.stem(term)
                                if pagetrie.has_key(stem):
                                    v = pagetrie[stem]
                                    if v[5].has_key(docId):
                                        v[5][docId] += 1
                                    else:
                                        v[0] += 1
                                        v[5][docId] = 1
                                else:
                                    pagetrie[stem] = [1, 0, 0, 0, 0, {docId: 1}, {}, {}, {}, {}]
                            term = ''
                    else:
                        term += lower
                if len(term) > 0 and not stop_list.has_key(term):
                    stem = porter2.stem(term)
                    if pagetrie.has_key(stem):
                        v = pagetrie[stem]
                        if v[5].has_key(docId):
                            v[5][docId] += 1
                        else:
                            v[0] += 1
                            v[5][docId] = 1
                    else:
                        pagetrie[stem] = [1, 0, 0, 0, 0, {docId: 1}, {}, {}, {}, {}]
                infoflag = 0
                catflag = 0
                txtflag = 1
                outflag = 0
                print docId, title
            i += 1
            continue
        elif line[:7] == "</page>":
            count += 1
            i += 1
            continue
        elif line[:5] == "<text":   # text flag
            l = line.split(">", 1)
            text = []
            if l[1][-7:] == "</text>":
                line = l[1].rsplit("<", 1)
                text.append(line[0])   # the original appended l[0] (the tag part) here
                i += 1
            else:
                text.append(l[1])
                i += 1
                while True:
                    if lines[i][-8:] == "</text>\n":
                        line = lines[i].rsplit("<", 1)
                        text.append(line[0])
                        break
                    text.append(lines[i])
                    i += 1
            text = "".join(text)
            text = "%s\n" % (text)
            txtlength = len(text)
            j = 0
            term = ''
            txtflag = 1
            prevind = 0
            flag = 0
            while j < txtlength:
                if text.startswith("[[", j):
                    j += 2
                    if text.startswith("Cate", j):   # [[Category:...]]
                        catflag = 1
                        flag = 1
                        j += 9
                    else:                            # [[outlink]]
                        if flag == 1:
                            break
                        outflag = 1
                        term = ''
                        txtflag = 0
                        while 1:
                            check = 0
                            if text.startswith("]", j):
                                check = 1
                            lower = text[j].lower()
                            if lower < 'a' or lower > 'z':
                                if len(term) > 2:
                                    if not stop_list.has_key(term):
                                        stem = porter2.stem(term)
                                        if pagetrie.has_key(stem):
                                            v = pagetrie[stem]
                                            if infoflag:
                                                if v[6].has_key(docId):
                                                    v[6][docId] += 1
                                                else:
                                                    v[1] += 1
                                                    v[6][docId] = 1
                                            if catflag:
                                                if v[7].has_key(docId):
                                                    v[7][docId] += 1
                                                else:
                                                    v[2] += 1
                                                    v[7][docId] = 1
                                            if outflag:
                                                if v[8].has_key(docId):
                                                    v[8][docId] += 1
                                                else:
                                                    v[3] += 1
                                                    v[8][docId] = 1
                                        else:
                                            pagetrie[stem] = [0, infoflag, catflag, outflag, 0,
                                                              {}, {}, {}, {}, {}]
                                            v = pagetrie[stem]
                                            if infoflag:
                                                v[6][docId] = 1
                                            if catflag:
                                                v[7][docId] = 1
                                            if outflag:
                                                v[8][docId] = 1
                                term = ''
                            else:
                                term += lower
                            if check == 1:
                                outflag = 0
                                catflag = 0
                                j += 2
                                break
                            j += 1
                        if not infoflag:
                            txtflag = 1
                elif text.startswith("{{", j):
                    j += 2
                    if text.startswith("Info", j):   # {{Infobox ...}}
                        infoflag = 1
                        txtflag = 0
                        j += 7
                        # print "infostart", infoflag
                    else:
                        while 1:
                            if text[j] == "}" or text[j] == '\n':
                                j += 2
                                break
                            j += 1
                elif text.startswith("}}\n", j):
                    infoflag = 0
                    txtflag = 1
                    # print "infoclose", infoflag
                    j += 3
                else:
                    lower = text[j].lower()
                    if lower < 'a' or lower > 'z':
                        if j - prevind > 3:   # j - prevind - 1 == term length
                            term = text[prevind + 1:j].lower()
                            if not stop_list.has_key(term):
                                stem = porter2.stem(term)
                                if pagetrie.has_key(stem):
                                    v = pagetrie[stem]
                                    if infoflag:
                                        if v[6].has_key(docId):
                                            v[6][docId] += 1
                                        else:
                                            v[1] += 1
                                            v[6][docId] = 1
                                    if catflag:
                                        if v[7].has_key(docId):
                                            v[7][docId] += 1
                                        else:
                                            v[2] += 1
                                            v[7][docId] = 1   # the original wrote v[stem][7] here
                                    if outflag:
                                        if v[8].has_key(docId):
                                            v[8][docId] += 1
                                        else:
                                            v[3] += 1
                                            v[8][docId] = 1
                                    if txtflag:
                                        if v[9].has_key(docId):
                                            v[9][docId] += 1
                                        else:
                                            v[4] += 1
                                            v[9][docId] = 1
                                else:
                                    pagetrie[stem] = [0, infoflag, catflag, outflag, txtflag,
                                                      {}, {}, {}, {}, {}]
                                    v = pagetrie[stem]
                                    if infoflag:
                                        v[6][docId] = 1
                                    if catflag:
                                        v[7][docId] = 1
                                    if outflag:
                                        v[8][docId] = 1
                                    if txtflag:
                                        v[9][docId] = 1
                        prevind = j
                    j += 1
                    continue
                prevind = j
        i += 1
    print "proc%d" % (file)
    printTrie(file, pagetrie)
    pagetrie.clear()
    return count
k = 0
scores = {}
length = {}
while k < len(keywords):
    if len(keywords[k]) == 0:
        k += 1
        continue
    if keywords[k] in "TICOB":   # single-letter field markers
        field = keywords[k]
        k += 1
        continue
    lower = keywords[k].lower()
    if stop_list.has_key(lower):
        k += 1
        continue
    stem = porter2.stem(lower)
    word = bisect.bisect_left(m, stem)
    if m[word] != stem:
        word -= 1
    # print stem, word, m[word], l[m[word]]
    seek = base.base62todecimal(l[m[word]])
    g.seek(seek)
    counter = 0
    check = 0
    while counter < width:
        line = g.readline()
        line = line.rstrip("\n")
        text = line.split("#", 1)
        if text[0] == stem:
            check = 1
            break