def remove_similar_words(word_list, master_list):
    # Could replace double for-loops with itertools.combinations?
    new_words = []
    stemmer = LancasterStemmer()
    for i in sorted(set(word_list)):
        x = []
        ii = re.sub("[^\w\s]", "", i)
        ii = re.sub("_", "", ii)
        ii = re.sub("[ ]", " ", ii)
        ii = stemmer.stem(ii)
        for j in sorted(master_list):
            jj = re.sub("[^\w\s]", "", j.decode('UTF-8'))
            jj = re.sub("_", "", jj)
            jj = re.sub("[ ]", " ", jj)
            if i[:3] == j[:3]:
                jj = stemmer.stem(jj)
                if ii == jj:
                    sim = nltk.edit_distance(i, j)
                    if sim > 1:
                        x.append(i)
                else:
                    x.append(i)
        x = list(set(x))
        new_words += x
    new_words = list(set(new_words))
    master_list = list(set(master_list))
    new_words = [i for i in new_words if i not in master_list]
    master_list.extend(list(set(new_words)))
    return new_words, master_list
def lemmatize(s):
    # Note: despite the name, this applies Lancaster stemming, not lemmatization.
    lancaster = LancasterStemmer()
    words = s.split()
    for i in range(0, len(words)):
        words[i] = lancaster.stem(words[i])
    s = ' '.join(words)
    return s
class Tokenizer():
    def __init__(self):
        self.stemmer = LancasterStemmer()

    def __call__(self, text):
        return [self.stemmer.stem(token) for token in word_tokenize(text)]
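# Hedged usage sketch (not part of the original snippet): the class above is callable,
# so it can be invoked directly; nltk.word_tokenize and the NLTK 'punkt' tokenizer data
# are assumed to be available.
tokenizer = Tokenizer()
print(tokenizer("The children were running and jumping"))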
def stem_words(words):
    stemmer = LancasterStemmer()
    stems_words = []
    for word in words:
        rstem = stemmer.stem(word)
        stems_words.append(rstem + " ")
    return stems_words
def stem_text(text):
    from nltk.stem import LancasterStemmer
    ls = LancasterStemmer()
    tokens = tokenize_text(text)
    filtered_tokens = [ls.stem(token) for token in tokens]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
def preprocess(raw_string):
    stop_words = set(stopwords.words('english'))
    res = ''
    if raw_string != 'NA':
        res = str(raw_string).lower().split()
        # Remove stopwords
        res = [word for word in res if word not in stop_words]
        # Remove punctuation
        tokenizer = RegexpTokenizer(r'\w+')
        tmp_res = []
        for s in res:
            tokens = tokenizer.tokenize(s)
            tmp_res.extend(tokens)
        res = tmp_res
        # Stemming
        stemmer = LancasterStemmer()
        tmp_res = []
        for s in res:
            if s.isalpha():
                tmp_res.append(stemmer.stem(s))
            else:
                tmp_res.append(s)
        res = tmp_res
    else:
        res = ['NA']
    return res
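# Hedged usage sketch (not part of the original snippet): assumes nltk's stopwords corpus,
# RegexpTokenizer and LancasterStemmer are importable exactly as the function above expects.
print(preprocess("The Quick Brown Foxes, jumping over 2 lazy dogs!"))
print(preprocess('NA'))  # the sentinel value is passed through as ['NA']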
def stemming(tokens):
    stemmer = LancasterStemmer()
    stems = []
    for word in tokens:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
def stem_words(words):
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
def lancaster_stemming(lista_doc):
    lista = []
    stemmer = LancasterStemmer()
    for doc in lista_doc:
        for word in doc:
            lista.append(stemmer.stem(word))
    return lista
class Tokenizer(object):
    def __init__(self):
        self.tok = RegexpTokenizer(r'some_regular_expression')
        self.stemmer = LancasterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(token) for token in self.tok.tokenize(doc)]
def strip_sentiment_list(data):
    lanc = LancasterStemmer()
    list_of_words = list()
    for i in data:
        new = re.sub(r'http\S+', '', i)
        for char in ['!', '?', ',', '.', '@', '#', '&', '\n', '"', '$', '%', "'", '(', ')',
                     '*', '-', '+', '/', '^', '[', ']', '”', '_', ':', ';', '|', '{', '}',
                     '~', '€', '£', '“', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
                     '=', '°', 'º', 'ʖ', '…', '⤵', '↔️']:
            new = new.replace(char, '')
        new = strip_emoji(new)
        new = new.lower()
        new = new.split(" ")
        for j in new:
            new_word = lanc.stem(j)
            list_of_words.append(new_word)
    return strip_stopwords(list_of_words)
class Baseline(object):
    def __init__(self):
        self.stemmer = LancasterStemmer()
        self.stopwords = set([self.stemmer.stem(word) for word in stopwords])

    def stem(self, doc):
        return [self.stemmer.stem(word) for word in doc]

    def doc_similarity(self, s1, s2, pairId=None):
        s1 = s1.lower().split()
        s2 = s2.lower().split()
        s1 = self.stem(s1)
        s2 = self.stem(s2)
        s1 = set(s1) - self.stopwords
        s2 = set(s2) - self.stopwords
        return float(len(s1.intersection(s2))) / ((len(s1) + len(s2)))
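# Hedged usage sketch (not part of the original snippet): the class reads a module-level
# iterable named `stopwords`; here it is assumed to be NLTK's English stop word list.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = nltk_stopwords.words('english')

baseline = Baseline()
print(baseline.doc_similarity("A cat sat on the mat", "Cats are sitting on mats"))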
def read_files(categories):
    feats = list()
    #porter = PorterStemmer()
    lancaster = LancasterStemmer()
    print("\n##### Reading files...")
    for category in categories:
        files = get_filenames_in_folder('Volkskrant/' + category)
        num_files = 0
        for f in files:
            data = open('Volkskrant/' + category + '/' + f, 'r', encoding='UTF-8').read()
            #data = data.lower()
            #data = porter.stem(data)
            tokens = word_tokenize(data)
            lancaster_list = [lancaster.stem(token) for token in tokens]
            #bag = bag_of_words(tokens)
            #ww = high_information([(bag, category)], [category])
            #bag = bag_of_words(lancaster_list)
            #bag = bag_of_non_stopwords(tokens)
            bag = bag_of_non_stopwords(lancaster_list)
            feats.append((bag, category))
            #print len(tokens)
            num_files += 1
def getSearchEngineResult(query_dict):
    result_dict = {}
    ix = index.open_dir("index")

    # with ix.searcher(weighting=scoring.BM25F()) as searcher:
    with ix.searcher(weighting=scoring.ScoringFunction()) as searcher:
        # TODO - Define your own query parser
        parser = QueryParser("contents", schema=ix.schema, group=OrGroup.factory(0))
        stemmizer = LancasterStemmer()
        stopWords = set(stopwords.words('english'))
        # print(stopWords)

        for qid, q in query_dict.items():
            # Map newline and punctuation characters to spaces; both arguments to
            # str.maketrans must have the same length, so five spaces are needed here.
            table = str.maketrans('\n?.,!', '     ')
            q_nomark = q.translate(table)

            new_q = ''
            for word in q_nomark.split(' '):
                if word.lower() not in stopWords:
                    word_stem = stemmizer.stem(word.lower())
                    new_q += word_stem + ' '
            # print(new_q)

            query = parser.parse(new_q.lower())
            results = searcher.search(query, limit=None)

            # for result in results:
            #     print(result.fields()['docID'], result.score)
            result_dict[qid] = [result.fields()['docID'] for result in results]

    return result_dict
def get_word_stems(date, num_subreddits):
    import nltk
    from nltk.stem import PorterStemmer
    from nltk.stem import LancasterStemmer
    from nltk.stem import WordNetLemmatizer

    table_id = f"top_{num_subreddits}_word_counts"
    query = f"""SELECT * FROM `{date}.{table_id}` """
    client = bigquery_client()
    job_config = bigquery.QueryJobConfig()
    job_config.use_legacy_sql = False
    query_job = client.query(query, job_config=job_config)
    df = query_job.to_dataframe()

    porter = PorterStemmer()
    lancaster = LancasterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()

    copy = df.copy()
    copy['porter'] = copy['word'].apply(lambda x: porter.stem(x))
    copy['lancaster'] = copy['word'].apply(lambda x: lancaster.stem(x))
    copy['lemmatised'] = copy['word'].apply(lambda x: wordnet_lemmatizer.lemmatize(x))

    store_blob(copy, bucket_name=date, blob_name=f"{table_id}_stems.csv")
def preproc(twts):
    stop_words = set(stopwords.words('english'))
    stab = []
    lstem = LancasterStemmer()
    REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
    REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
    REPLACE_REP = re.compile(r"(\w)\1{2,}")
    #abst = [i for sublist in dat for i in sublist]
    i = 0
    for tweets in twts:
        tweets = tweets.lower()
        tweets = re.sub(r'[^\w\s]', '', tweets)
        tweets = REPLACE_NO_SPACE.sub("", tweets)
        tweets = REPLACE_WITH_SPACE.sub(" ", tweets)
        tweets = REPLACE_REP.sub("", tweets)
        word_tokens = word_tokenize(tweets)
        filtered_sentence = [lstem.stem(w) for w in word_tokens if not w in stop_words]
        stab.append(filtered_sentence)
        i += 1
        print(i / len(twts))
    stab = [remove_num_nword(i) for i in stab]
    stab = list(map(lambda x: ' '.join(x), stab))
    return stab
def process(self):
    """
    Apply all the specified preprocessing steps to the tokens from a segment
    :return: Tokens, with preprocessing steps applied to them
    """
    processed_tokens = self.tokens
    if self.stopword_removal:
        stop = set(stopwords.words('english'))
        processed_tokens = [self.case(token[0]) for token in processed_tokens if token[0] not in stop]
    if self.stemming:
        if self.stemmer == 'lancaster':
            ls = LancasterStemmer()
            processed_tokens = [self.case(ls.stem(token[0])) for token in processed_tokens]
        elif self.stemmer == 'snowball':
            ss = SnowballStemmer('english')
            processed_tokens = [self.case(ss.stem(token[0])) for token in processed_tokens]
        else:
            ps = PorterStemmer()
            processed_tokens = [self.case(ps.stem(token[0])) for token in processed_tokens]
    if self.lemmatization:
        lemma = nltk.wordnet.WordNetLemmatizer()
        processed_tokens = [self.case(lemma.lemmatize(token[0], token[1])) for token in processed_tokens]
    # If no preprocessing method was selected, return only ['Bagdad', 'car', ...]
    # instead of [['Bagdad', 'NNP'], ['car', 'NN'], ...]
    if not self.stopword_removal and not self.stemming and not self.lemmatization:
        processed_tokens = [self.case(token[0]) for token in processed_tokens]
    return processed_tokens
def stem_metn(request):
    soz_class = NameForm
    cumle_class = TextForm
    morf_class = SozForm
    porter = PorterStemmer()
    lancaster = LancasterStemmer()
    k = request.POST.get('metn', '')
    alqo = request.POST.get('alqo', '')
    txt = k
    # Branch labels are Azerbaijani form values ('Bizim Alqoritm' = "our algorithm").
    if alqo == 'Bizim Alqoritm':
        txt = metn_oxu(k)
    elif alqo == 'Porter Alqoritmi':
        txt = porter.stem(txt)
    elif alqo == 'Lancaster Alqoritmi':
        txt = lancaster.stem(txt)
    elif alqo == 'WordNet Alqoritmi':
        wordnet_lemmatizer = WordNetLemmatizer()
        # WordNetLemmatizer has no stem() method; lemmatize() is the correct call.
        txt = metn_oxu(wordnet_lemmatizer.lemmatize(k))
    return render(request, 'metn.html', {
        'form': soz_class,
        'cumle': cumle_class,
        'morf': morf_class,
        'txt': txt
    })
def train_bayes(train_list, N_d):
    log_prior = {}
    V = set()
    big_doc = {}
    log_likelihood = {}
    alpha = 0.5
    lc = LancasterStemmer()
    tags = ['NN', 'NNS', 'NNP', 'NNPS']
    stop_words = set(stopwords.words('english'))

    for c in train_list.keys():
        # log_prior
        for file in train_list[c]:
            if c in log_prior:
                log_prior[c][0] = log_prior[c][0] + 1 / N_d
                log_prior[c][1] = math.log(log_prior[c][0])
            else:
                log_prior[c] = []
                log_prior[c].append(1 / N_d)
                log_prior[c].append(math.log(1 / N_d))

            # V & big_doc
            cur_file = open(file, 'r')
            token_file = sent_tokenize(cur_file.read())
            for sent in token_file:
                token_sent = word_tokenize(sent)
                tagged_sent = nltk.pos_tag(token_sent)
                for word in tagged_sent:
                    weight = 1
                    if word[1] in tags:
                        weight = 2
                    stem_word = lc.stem(word[0])
                    if stem_word not in stop_words:
                        if stem_word not in V:
                            V.add(stem_word)
                        if c in big_doc:
                            if stem_word in big_doc[c]:
                                big_doc[c][stem_word] += weight
                            else:
                                big_doc[c][stem_word] = weight
                        else:
                            big_doc[c] = {}
                            big_doc[c][stem_word] = weight
            cur_file.close()

    big_doc_size = {}
    for c in train_list.keys():
        big_doc_size[c] = sum(big_doc[c].values())

    # log_likelihood
    for c in train_list.keys():
        for w in V:
            if w not in big_doc[c]:
                log_likelihood[(w, c)] = math.log(alpha / (big_doc_size[c] + (alpha * len(V))))
            else:
                log_likelihood[(w, c)] = math.log((big_doc[c][w]) / (big_doc_size[c]))
    return log_prior, log_likelihood, V
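# Hedged sketch (not part of the original source): one way the returned log_prior,
# log_likelihood and V structures could be used to classify a new document, reusing the
# same Lancaster stemming and tokenization as in training. predict_class is a
# hypothetical helper name.
def predict_class(text, log_prior, log_likelihood, V):
    lc = LancasterStemmer()
    classes = set(c for (_, c) in log_likelihood.keys())
    scores = {}
    for c in classes:
        scores[c] = log_prior[c][1]  # log prior stored at index 1 by train_bayes
        for word in word_tokenize(text):
            stem_word = lc.stem(word)
            if stem_word in V:
                scores[c] += log_likelihood.get((stem_word, c), 0.0)
    return max(scores, key=scores.get)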
def stemmingLS(self):
    from nltk.stem import LancasterStemmer
    ls = LancasterStemmer()
    temp = copy.deepcopy(self.data)
    for i in range(len(temp)):
        for j in range(len(temp[i][1])):
            temp[i][1][j] = ls.stem(temp[i][1][j])
    return temp
def lancaster_stemmer(text):
    lancaster = LancasterStemmer()
    stemmed_words = []
    for word in text.split(" "):
        stemmed_words.append(lancaster.stem(word))
    return " ".join(stemmed_words)
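# Hedged usage sketch (not part of the original snippet): whitespace splitting keeps
# punctuation attached to words, unlike the word_tokenize-based helpers elsewhere in
# this collection.
print(lancaster_stemmer("maximum multiplication of happiness"))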
def token_stemming(x):
    '''Uses the LancasterStemmer from nltk to stem tokens in the provided list of lists `x`'''
    stemmer = LancasterStemmer()
    x = [[stemmer.stem(w) for w in s] for s in x]
    return x
def calculate_uncommon_words_percent(text_file_name):
    # read the text from a file
    text_file = open(text_file_name)
    text = text_file.read()
    text_file.close()

    # replace punctuation with empty strings
    punctuation_marks = ['.', ',', ':', ';', '(', ')', '!', '?', '[', ']', '$', '"', "'",
                         '’', '-', '–', '●', '\t', '“', '”', '\n',
                         '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    for punctuation_mark in punctuation_marks:
        text = text.replace(punctuation_mark, ' ')

    # chop the text into words, using spaces as word dividers
    words_in_text = text.split()
    stemmer = LancasterStemmer()
    stems_in_text = []
    for word in words_in_text:
        stem = stemmer.stem(word)
        lower_case_stem = stem.lower()
        stems_in_text.append(lower_case_stem)

    # read the 1000 most common English words from a file
    common_words_file = open('most_common_words.txt')
    common_words_text = common_words_file.read()
    common_words_file.close()
    common_words = common_words_text.split()
    common_stems = []
    for word in common_words:
        stem = stemmer.stem(word)
        lower_case_stem = stem.lower()
        common_stems.append(lower_case_stem)

    uncommon_stems = []
    for stem in stems_in_text:
        if not (stem in common_stems):
            uncommon_stems.append(stem)

    pct_uncommon_stems = len(uncommon_stems) / len(stems_in_text)
    rounded_pct_uncommon_stems = round(pct_uncommon_stems, 3)
    return rounded_pct_uncommon_stems
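# Hedged usage sketch (not part of the original snippet): 'sample_essay.txt' is a
# hypothetical file name, and 'most_common_words.txt' must exist in the working
# directory because the function above hard-codes that path.
print(calculate_uncommon_words_percent('sample_essay.txt'))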
def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
def get_stems(self):
    stemmerlan = LancasterStemmer()
    stems = []
    for i in self.words:
        stems.append(stemmerlan.stem(i))
    print("\nAfter stemming:")
    print(stems)
    return stems
def lancaster_stem(tokenize_list):
    l = LancasterStemmer()
    result = []
    for w in tokenize_list:
        result.append(l.stem(w))
    return result
def stem_words(row):
    stemmed_row = ''
    lancaster = LancasterStemmer()
    for word in row:
        word = word.strip()
        stemmed_word = lancaster.stem(word)
        stemmed_row += stemmed_word + ' '
    return stemmed_row
def stem_words(words):
    stemmer = LancasterStemmer()
    stems_words = []
    for word in words:
        stem = stemmer.stem(word)
        stems_words.append(stem)
        stems_words.append(' ')  # add space between words
    return stems_words
def stemSentence(sentence):
    lancaster = LancasterStemmer()
    token_words = word_tokenize(sentence)
    stem_sentence = []
    for word in token_words:
        stem_sentence.append(lancaster.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)
def stem_words(wrd):
    stemmer = LancasterStemmer()  # Selects the stemmer from nltk
    stems = []                    # List of updated words
    for word in wrd:
        stem = stemmer.stem(word)  # Stems the word
        stems.append(stem)         # and appends it to the list
    return stems
def words_stem(tokens):
    """
    convert all words to root of that word
    :param tokens:
    :return:
    """
    from nltk.stem import PorterStemmer, LancasterStemmer
    stemmer = LancasterStemmer()
    return [stemmer.stem(word) for word in tokens]
def lemmatization(lista_palabras_sp_ssw_scr):
    stemmerlan = LancasterStemmer()
    lista_palabras_sp_ssw_scr_stem = []
    for word in lista_palabras_sp_ssw_scr:
        lista_palabras_sp_ssw_scr_stem.append(stemmerlan.stem(word))
    tokens = set(lista_palabras_sp_ssw_scr_stem)
    #print(tokens)
    return tokens
class TFIDF:
    def __init__(self):
        self.pickle_docs = "tfidf_pickle_docs"
        self.pickle_corpus = "tfidf_pickle_corpus"
        self.lan = LancasterStemmer()
        self.construct()
        #print sorted(self.words.iteritems(), key=operator.itemgetter(1), reverse=True)[:20]

    def clean(self, word):
        '''cleans a word or returns None if it should not be considered'''
        word = word.strip(string.punctuation)
        word = self.lan.stem(word)
        return word

    def construct(self):
        corpus = {}
        # Check to see if we should simply load a pickle
        if os.path.isfile(self.pickle_docs):
            with open(self.pickle_docs) as docs_file:
                current_doclist = pickle.load(docs_file)
                if os.listdir('articles/') == current_doclist:
                    # current article list is the same as pickled article list
                    # so we want to just load the stored pickled corpus data
                    with open(self.pickle_corpus) as corpus_file:
                        self.words = pickle.load(corpus_file)
                        self.n = len(current_doclist)
                        return

        # If we don't load a pickle, build the corpus from articles/ dir
        num_docs = 0.0
        for file_name in os.listdir('articles/'):
            num_docs += 1
            doc = {}
            with open("articles/" + file_name) as article:
                for line in article:
                    for word in tokenize(line, "word", return_spans=False):
                        word = self.clean(word)
                        doc[word] = 1
            for key in doc.keys():
                corpus[key] = corpus.get(key, 0) + 1

        self.words = corpus
        self.n = num_docs

        print "Pickling a new TFIDF corpus"
        # pickle corpus and document list
        with open(self.pickle_docs, "w") as docs_file:
            pickle.dump(os.listdir('articles/'), docs_file)
        with open(self.pickle_corpus, "w") as corpus_file:
            pickle.dump(self.words, corpus_file)

    def weight(self, word, count, debug=False):
        if debug:
            return (word, count, self.words.get(word, 1))
        return count * math.log(self.n / self.words.get(word, 1))
def stemWords(self, words):
    """Stem words in list of tokenized words"""
    # Assumes the stemmer name is stored on the instance (e.g. self.stemmer = "porter");
    # the original compared an unassigned local variable. "snowbal" typo fixed and
    # SnowballStemmer given its required language argument.
    if self.stemmer == "lancaster":
        stemmer = LancasterStemmer()
    elif self.stemmer == "snowball":
        stemmer = SnowballStemmer("english")
    elif self.stemmer == "porter":
        stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in words]
    return stems
def stemming(word):
    # Use stemmers for removing morphological affixes from words.
    Portst = PorterStemmer()
    Landst = LancasterStemmer()
    Regst = RegexpStemmer('ing|ed')
    new = Portst.stem(word)
    if new == word:
        new = Landst.stem(word)
        if new == word:
            new = Regst.stem(word)
    return new
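# Hedged usage sketch (not part of the original snippet): the cascade above only falls
# through to the Lancaster and Regexp stemmers when the previous stemmer left the word
# unchanged.
for w in ['jumping', 'maximum', 'news']:
    print(w, '->', stemming(w))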
class StemTokenizer(object):
    def __init__(self, stemmer_type='Porter'):
        self.stemmer_type = stemmer_type
        if self.stemmer_type == 'Porter':
            self.stemmer = PorterStemmer()
        elif self.stemmer_type == 'Lancaster':
            self.stemmer = LancasterStemmer()
        else:
            raise Exception('Invalid stemmer_type = {0}'.format(stemmer_type))

    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]
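# Hedged usage sketch (not part of the original snippet): a callable tokenizer like this
# is commonly passed to scikit-learn's text vectorizers; scikit-learn is an assumed
# extra dependency here.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=StemTokenizer('Lancaster'), lowercase=True)
X = vectorizer.fit_transform(["The runners were running", "A runner runs quickly"])
print(vectorizer.get_feature_names_out())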
def stem(self, input_text):
    tokenizer = RegexpTokenizer("\s+", gaps=True)
    stemmed_text = []
    lemmatizer = WordNetLemmatizer()
    stemmer = LancasterStemmer()
    text = tokenizer.tokenize(str(input_text))
    filtered_text = self.stopword(text)
    for word in filtered_text:
        if word.isalpha():
            stemmed_text.append(stemmer.stem(word).lower())
    " ".join(stemmed_text)
    return stemmed_text
class LancasterTokenizer(object):
    def __init__(self):
        self.ls = LancasterStemmer()
        self.rx = RegexpTokenizer(r"(?u)\b\w\w+\b")

    def isNumber(self, s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    def __call__(self, doc):
        return [self.ls.stem(t) for t in self.rx.tokenize(doc) if not self.isNumber(t)]
class Tokenizer():
    """
    Tokenizes and stems text using NLTK libraries
    """

    def __init__(self):
        """
        Constructs a tokenizer object
        """
        self.stemmer = LancasterStemmer()

    def __call__(self, text):
        """
        Tokenizes text

        :param text: the text to tokenize
        :type text: str or unicode
        :return: a list of tokens
        :rtype: list of (str or unicode)
        """
        return [self.stemmer.stem(token) for token in word_tokenize(text)]
def word_refiner(*args):
    Portst = PorterStemmer()
    Landst = LancasterStemmer()
    Regst = RegexpStemmer('ing|ed|ly|lly')
    args = [i for i in args if isinstance(i, unicode)]
    for w in map(str, args):
        if w in dic1:
            yield w
        else:
            st1 = Portst.stem(w)
            if st1 in dic1:
                yield st1
            else:
                st2 = Landst.stem(w)
                if st2 in dic1:
                    yield st2
                else:
                    st3 = Regst.stem(w)
                    if st3 in dic1:
                        yield st3
                    else:
                        yield w
def preprocess(sentence):
    output_list = []

    # CASE FOLDING [NOT COMPLETE]
    sentence = sentence.lower()

    # DATA CLEANING
    # Note: str.replace() performs literal replacement, so the regex-style patterns below
    # ('[https://]?[t.co/]?', '[#]?', '[RT]?') match nothing and have no effect;
    # re.sub() would be needed to actually strip URLs, hashtags and retweet markers.
    sentence = sentence.replace('[https://]?[t.co/]?', '')
    sentence = sentence.replace('@', '')
    sentence = sentence.replace('[#]?', '')
    sentence = sentence.replace('[RT]?', '')
    sentence = sentence.replace(',', '')
    sentence = sentence.replace('!', '')
    sentence = sentence.replace('?', '')
    sentence = sentence.replace('.', '')
    sentence = sentence.replace('\'', '')
    sentence = sentence.replace('\"', '')
    sentence = sentence.replace(':', '')

    # REMOVE REPEATED CHARS
    #sentence = re.sub(r'(\w)\1+', r'\1', sentence)

    # TOKENIZE
    tt = TweetTokenizer()
    temp = tt.tokenize(sentence)

    # REMOVE STOP WORDS
    stop = stopwords.words('english')

    # STEMMING
    ls = LancasterStemmer()
    newtemp = [eachword for eachword in temp if eachword not in stop]
    for eachword in newtemp:
        output_list.append(ls.stem(eachword))
    return output_list
class LancasterTokenizer(object):
    def __init__(self):
        self.wnl = LancasterStemmer()

    def __call__(self, doc):
        return [self.wnl.stem(t) for t in wordpunct_tokenize(doc)]
import nltk
from nltk.stem import LancasterStemmer

stemmerlan = LancasterStemmer()
print(stemmerlan.stem('working'))
print(stemmerlan.stem('happiness'))
        Tokens2.append(w)

    # fix ascii again, don't know whats happening here!
    for w in Tokens2:
        for char in w:
            if ord(char) > 128:
                pass  # original statement was truncated here ("w."); non-ASCII handling was never finished

    # stemming
    Tokens3 = []
    for w in Tokens2:
        Tokens3.append(lanStem.stem(w))
    return Tokens3


test = clean(tesSum1)

# read files
tesSum1 = teslaSummary1.read()
tesSum5 = teslaSummary5.read()

#stpWrds = set(stopwords.words("english"))
#print stpWrds

sum1SentTok = sentTok(tesSum1)
sum2SentTok = sentTok(tesSum2)
sum1WordTok = wordTok(tesSum1)
sum2WordTok = wordTok(tesSum2)

for w in sum1WordTok:
    # broken in the original: stem() is called on the LancasterStemmer class rather than an
    # instance, and tryAgain is never defined; stemming each token would look like
    # LancasterStemmer().stem(w)
    tryAgain + LancasterStemmer.stem('stemming')
else:
    urlUsed.add(url)
    the_page = Response.read()

    # url parser
    soup = BeautifulSoup(the_page, 'html.parser')
    p = soup.findAll('p')
    for ps in p:
        psStr = str(ps.get_text().encode('utf-8'))
        psStr = re.sub(r'[^a-zA-Z\s\n]', ' ', psStr)
        if len(psStr) > 2:
            psStr = nltk.word_tokenize(psStr)
            for ss in psStr:
                wordLemmatized = wnl.lemmatize(ss)
                wordStemed = stem.stem(wordLemmatized)
                tempArticle += str(wordStemed).lower() + ' '
        # print(ps.get_text())

    if len(tempArticle) > 800:
        r += 1
        writer = open('texts/' + str(r), 'w')
        writer.write(tempArticle)
        writer.close()
        writer = open('labellist3', 'a')
        writer.write(url + '\t' + str(r) + os.linesep)
        writer.close()
    # if hasChinese(ps.get_text()):
    # else:
# porter stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()
print ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped')
print ps.stem('lying')
print ps.stem('strange')

# lancaster stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()
print ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped')
print ls.stem('lying')
print ls.stem('strange')

# regex stemmer
from nltk.stem import RegexpStemmer
rs = RegexpStemmer('ing$|s$|ed$', min=4)
print rs.stem('jumping'), rs.stem('jumps'), rs.stem('jumped')
print rs.stem('lying')
print rs.stem('strange')
def stemmed(self, word):
    stemmer = LancasterStemmer()
    return stemmer.stem(word)
def stem_document(document):
    from nltk.stem import LancasterStemmer
    stemmer = LancasterStemmer()
    return stemmer.stem(document)
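# Hedged variant (not part of the original source): LancasterStemmer.stem() treats its
# argument as a single word, so a multi-word document is better stemmed token by token.
# stem_document_tokens is a hypothetical helper name.
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize

def stem_document_tokens(document):
    stemmer = LancasterStemmer()
    return ' '.join(stemmer.stem(token) for token in word_tokenize(document))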
# <h2>Stemming Words</h2>
# <p>Stemming is the process of removing <em>affixes</em> from a word to obtain its root, or <em>stem</em>.
# For example, the stem of <strong>growing</strong> is <strong>grow</strong>.</p>
# <p>NLTK includes four stemmer classes, three of which are demonstrated below. The fourth, <em>Snowball</em>,
# also covers non-English languages and is not shown here but is in the text.</p>

# <codecell>

from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer

porter = PorterStemmer()
lancaster = LancasterStemmer()
reg = RegexpStemmer('ing')

g = 'growing'
print 'Porter yields: ', porter.stem(g)
print 'lancaster yields: ', lancaster.stem(g)
print 'Regexp yields: ', reg.stem(g)

# <markdowncell>
# <p>The output of various words can be different between stemmers:</p>

# <codecell>

g = 'cookery'
print 'Porter yields: ', porter.stem(g)
print 'lancaster yields: ', lancaster.stem(g)
print 'Regexp yields: ', reg.stem(g)

# <markdowncell>
__author__ = "pratap"

# Porter Stemmer
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print stemmer.stem("cooking")
print stemmer.stem("cookery")

# Lancaster Stemmer
from nltk.stem import LancasterStemmer
lanc_stemmer = LancasterStemmer()
print lanc_stemmer.stem("cooking")
print lanc_stemmer.stem("cookery")
class TFIDF(object):
    def __init__(self, tfidf_file, id2wordFile=None):
        self.model = models.TfidfModel.load(tfidf_file)
        self.stemmer = LancasterStemmer()
        self.stopwords = set([self._preprocess_word(word) for word in stopwords])
        #self.stem_model()
        print "done"

    def _preprocess_word(self, word):
        return self.stemmer.stem(word.lower())
        #return word.lower()

    def stem(self, doc):
        return [self.stemmer.stem(word) for word in doc]

    def stem_model(self):
        print "stemming"
        new_id2word = corpora.Dictionary()
        # Create a new dictionary with the stemmed terms and summed document frequencies
        for termid, freq in self.model.dfs.iteritems():
            stemmed_word = self.stemmer.stem(self.model.id2word[termid])
            stemmed_id = None
            if stemmed_word in new_id2word.token2id:
                stemmed_id = new_id2word.token2id[stemmed_word]
            else:
                stemmed_id = len(new_id2word.token2id)
                new_id2word.token2id[stemmed_word] = stemmed_id
                new_id2word.dfs[stemmed_id] = 0
            new_id2word.dfs[stemmed_id] += freq  # add df from old dictionary
        new_id2word.num_docs = self.model.id2word.num_docs
        new_id2word.num_nnz = self.model.id2word.num_nnz
        new_id2word.num_pos = self.model.id2word.num_pos
        self.model.id2word = new_id2word
        self.model.dfs = self.model.id2word.dfs
        self.model.idfs = precompute_idfs(self.model.wglobal, self.model.dfs, self.model.num_docs)
        self.model.save('models/all_lancaster.tfidfmodel')
        print len(new_id2word)
        print "done stemming"

    def restrict_vocab(self, corpus):
        vocab = set()
        for doc in corpus:
            for idx, freq in doc:
                vocab.add(idx)
        # dfs/idfs were used without being initialized in the original
        dfs = {}
        idfs = {}
        for idx in vocab:
            dfs[idx] = self.model.dfs[idx]
            idfs[idx] = self.model.idfs[idx]
        self.model.dfs = dfs
        self.model.idfs = idfs

    def to_bow(self, doc):
        doc = [self._preprocess_word(word) for word in doc.lower().split() if word not in self.stopwords]
        return self.model.id2word.doc2bow(doc)

    def doc_similarity(self, s1, s2, pairId=None):
        # tfidf1 = self.model[self.to_bow(s1)]
        # tfidf2 = self.model[self.to_bow(s2)]
        # index = similarities.MatrixSimilarity([tfidf1], num_features=len(self.model.id2word))
        # return math.sqrt(index[tfidf2][0])*4. + 1
        tfidf1 = self.model[self.to_bow(s1)]
        tfidf2 = self.model[self.to_bow(s2)]
        common_terms = set(zip(*tfidf1)[0]) & set(zip(*tfidf2)[0])
        similarity = 0.
        tfidf_total = 0.
        for term, freq in tfidf1 + tfidf2:
            if term in common_terms:
                similarity += freq
            tfidf_total += freq
        val = math.sqrt(similarity / tfidf_total) * 5.
        if val < 1.:
            val += 1.
        return val
def LancasterTokenizer(s):
    from nltk import word_tokenize
    from nltk.stem import LancasterStemmer
    stemmer = LancasterStemmer()
    return [stemmer.stem(t) for t in word_tokenize(s)]
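# Hedged usage sketch (not part of the original snippet): the function maps a string to a
# list of stemmed tokens, so it can be called directly or passed wherever a tokenizer
# callable is expected.
print(LancasterTokenizer("The geese were flying south"))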