def load_clueweb12_B13_termstat_stemmed() -> Tuple[Dict, Dict]:
    from krovetzstemmer import Stemmer
    stemmer = Stemmer()
    tf, df = load_clueweb12_B13_termstat()

    # Merge term frequencies of surface forms that share a Krovetz stem.
    new_tf = Counter()
    for key, cnt in tf.items():
        new_tf[stemmer.stem(key)] += cnt

    # Collect the per-form document frequencies under each stem.
    df_info = defaultdict(list)
    for key, cnt in df.items():
        df_info[stemmer.stem(key)].append(cnt)

    # Documents containing several surface forms of the same stem overlap,
    # so instead of summing the counts outright, add them largest-first
    # with a geometrically shrinking discount.
    new_df = Counter()
    for key, cnt_list in df_info.items():
        cnt_list.sort(reverse=True)
        discount = 1
        discount_factor = 0.3
        df_est = 0
        for cnt in cnt_list:
            df_est += cnt * discount
            discount *= discount_factor
        new_df[key] = int(df_est)
    return new_tf, new_df
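A quick sanity check of the discounting logic above, with made-up counts; the numbers and the token "running" are illustrative only.

from krovetzstemmer import Stemmer

# Suppose three surface forms of one stem have document frequencies 100, 50, 20.
# Summing would over-count documents that contain several forms; the geometric
# discount yields 100*1 + 50*0.3 + 20*0.09 = 116.8, truncated to 116.
cnt_list = sorted([100, 50, 20], reverse=True)
df_est, discount = 0.0, 1.0
for cnt in cnt_list:
    df_est += cnt * discount
    discount *= 0.3
print(Stemmer().stem("running"), int(df_est))  # run 116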
def clean_text(text):
    stemmer = Stemmer()
    return [
        stemmer.stem(token.lower())
        for token in nltk.word_tokenize(re.sub("[ ]+", " ", re.sub("\n", "", text)))
        # compare the lowercased token, so capitalized stopwords are caught too
        if token.isalnum() and token.lower() not in stopwords.words("english")
    ]
def main():
    global args
    save_prefix = args.s
    if args.f is None:
        print('Please specify input file.')
        return
    # read the input file
    with open(args.f, 'r') as f:
        document = f.read()
    # strip punctuation
    document = re.sub(r"[<>:;()\?\.\!\/_,&%^*(+\"\']+", " ", document)
    # stop word list; replace with your own list if needed
    stop_word_list = stopwords.words('english')
    # tokenize, lowercase, filter stop words, stem
    s = Stemmer()
    word_list = [
        s.stem(w.lower())
        for w in word_tokenize(text=document)
        if w not in stop_word_list
    ]
    # append as text, not bytes, since the joined result is a str
    with open(save_prefix + '/result.txt', 'a') as f:
        f.write(' '.join(word_list))
def stem(algo, text):
    if algo == "krovetz":
        stemmer = Stemmer()
        return stemmer.stem(text)
    elif algo == "porter":
        stm = PorterStemmer()
        return stm.stem(text)
    print("ERROR STEMMING: {t} unknown.".format(t=algo))
def stem(algo, text):
    if algo == "krovetz":
        stemmer = Stemmer()
        return stemmer.stem(text)
    elif algo == "porter":
        s = stm()
        return s.stem(text)
    else:
        print("ERROR STEMMING: {t} unknown.".format(t=algo))
        exit()
def stem(text):
    # print("Stemming...")
    stemmer = Stemmer()
    stemmed = ""
    for word in text.split():
        # start a new line whenever a document id marker is seen
        if word == 'docid':
            stemmed = stemmed + '\n'
        stemmed = stemmed + ' ' + stemmer.stem(word)
    return stemmed
def load_df_stemmed(term_stat_path):
    stemmer = Stemmer()
    df = load_df(term_stat_path)
    new_df = Counter()
    for key, value in df.items():
        try:
            new_df[stemmer.stem(key)] += value
        except UnicodeDecodeError:
            # the stemmer can choke on badly encoded terms; skip them
            pass
    return new_df
def tokenize(text):
    stemmer = Stemmer()
    return [
        stemmer.stem(token.lower())
        for token in nltk.word_tokenize(
            re.sub("\n", "",
                   text.translate(str.maketrans(punctuation, " " * len(punctuation)))))
        if (token.isalnum()
            and token.lower() not in stopwords.words("english")
            and len(token) > 1)
    ]
def read_past_winners_file(winners_file):
    winners_data = {}
    stemmer = Stemmer()
    with open(winners_file) as file:
        for line in file:
            query = line.split("@@@")[0]
            text = line.split("@@@")[1]
            if query not in winners_data:
                winners_data[query] = []
            text = " ".join([stemmer.stem(word) for word in clean_text(text).split()])
            winners_data[query].append(text)
    return winners_data
def modify_text(text, index, query):
    stemmer = Stemmer()
    query_terms = [stemmer.stem(q) for q in query.split()]
    if index == 4:
        # wrap the text with the query on both sides
        return query + text + query
    elif index == 0:
        p = 0.5
    elif index == 2:
        p = 0.2
    else:
        # guard against indices for which no drop probability is defined
        raise ValueError("unsupported index: {}".format(index))
    # drop each query-term occurrence with probability p
    new_text = ""
    tokens = clean_texts(text).split()
    for token in tokens:
        if stemmer.stem(token) in query_terms:
            if random() < p:
                continue
        new_text += token + " "
    return new_text
class PCTokenizer:
    def __init__(self):
        self.stemmer = Stemmer()

    def tokenize_stem(self, text: str) -> List[str]:
        tokens = nltk.tokenize.word_tokenize(text)
        stemmed_tokens = []
        for t in tokens:
            try:
                stemmed_tokens.append(self.stemmer.stem(t))
            except Exception:
                # skip tokens the stemmer cannot handle
                pass
        return stemmed_tokens
class Tokenizer:
    def __init__(self, vocab_path, unk="<UNK>", pad="<PAD>"):
        self.vocab_path = vocab_path
        self.unk = unk
        self.pad = pad
        self.word2idx = self.load_vocab(vocab_path)
        # stopword lookup table
        self.sws = {}
        for w in stopwords.words('english'):
            self.sws[w] = 1
        self.stemmer = Stemmer()

    def load_vocab(self, vocab_path):
        # ids 0 and 1 are reserved for the pad and unk tokens
        word2idx = {}
        word2idx[self.pad] = 0
        word2idx[self.unk] = 1
        with open(vocab_path) as fin:
            for step, line in enumerate(fin):
                tokens = line.strip().split()
                word2idx[tokens[0]] = step + 2
        return word2idx

    def tok2idx(self, toks, word2idx):
        input_ids = []
        for tok in toks:
            if tok in word2idx:
                input_ids.append(word2idx[tok])
            else:
                # use the configured unk token rather than a hardcoded '<UNK>'
                input_ids.append(word2idx[self.unk])
        return input_ids

    def tokenize(self, line):
        regex_drop_char = re.compile(r'[^a-z0-9\s]+')
        regex_multi_space = re.compile(r'\s+')
        toks = regex_multi_space.sub(
            ' ', regex_drop_char.sub(' ', line.lower())).strip().split()
        # drop stopwords and stem everything else
        wordsFiltered = []
        for w in toks:
            if w not in self.sws:
                wordsFiltered.append(self.stemmer.stem(w))
        return wordsFiltered

    def convert_tokens_to_ids(self, toks):
        input_ids = []
        for tok in toks:
            if tok in self.word2idx:
                input_ids.append(self.word2idx[tok])
            else:
                input_ids.append(self.word2idx[self.unk])
        return input_ids
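A minimal sketch of driving this Tokenizer; vocab.txt is a hypothetical one-term-per-line vocabulary file, and the expected outputs in the comments assume the stopword-dropping behavior above.

tokenizer = Tokenizer("vocab.txt")
toks = tokenizer.tokenize("The dogs were running!")  # ['dog', 'run']
ids = tokenizer.convert_tokens_to_ids(toks)          # out-of-vocab terms map to id 1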
class CacheStemmer:
    def __init__(self):
        self.stemmer = Stemmer()
        self.stem_dict = dict()

    def stem(self, token):
        if token in self.stem_dict:
            return self.stem_dict[token]
        r = self.stemmer.stem(token)
        self.stem_dict[token] = r
        return r

    def stem_list(self, tokens):
        return [self.stem(t) for t in tokens]
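Hypothetical usage of CacheStemmer: repeated tokens hit the dict instead of the stemmer, which pays off when stemming large corpora with many repeated terms.

cached = CacheStemmer()
print(cached.stem_list(["ponies", "running", "ponies"]))  # ['pony', 'run', 'pony']
print(len(cached.stem_dict))  # 2 distinct tokens were actually stemmed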
def data_to_wordsentences(raw_data):
    """
    Convert a text into a list of sentences.
    :param raw_data: object whose .text attribute holds the text to convert
    :return: list of sentences, each a list of stemmed words
    """
    sentences = []
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(raw_data.text.strip())
    stemmer = Stemmer()  # create once rather than per sentence
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            words = tokenise_text(raw_sentence)
            for idx, w in enumerate(words):
                words[idx] = stemmer.stem(w.decode("utf-8", "ignore"))
            sentences.append(words)
    return sentences
def generate_sentences_list_from_raw_text_list(raw_text_list):
    """
    Convert a list of texts into a list of sentences for training Word2Vec.
    :param raw_text_list: list of texts to be converted
    :return: list of sentences, each a list of stemmed words
    """
    sentences_list = []
    stemmer = Stemmer()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for raw_text in raw_text_list:
        raw_sentences = tokenizer.tokenize(raw_text)
        for raw_sentence in raw_sentences:
            if len(raw_sentence) > 0:
                words = tokenise_text(raw_sentence)
                for idx, w in enumerate(words):
                    words[idx] = stemmer.stem(w.decode("utf-8", "ignore"))
                sentences_list.append(words)
    return sentences_list
class StemmerCache:
    def __init__(self, cache=None):
        from krovetzstemmer import Stemmer
        self.stemmer = Stemmer()
        self.cache = cache if cache is not None else dict()

    def stem(self, t):
        if t in self.cache:
            return self.cache[t]
        r = self.stemmer.stem(t)
        self.cache[t] = r
        # checkpoint the cache to disk every 1000 new entries
        if len(self.cache) % 1000 == 0:
            with open("stemmer.pickle", "wb") as f:
                pickle.dump(self.cache, f)
        return r
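A sketch of warm-starting the cache from an earlier run's checkpoint; it assumes a stemmer.pickle file produced by the class above already exists.

import pickle

with open("stemmer.pickle", "rb") as f:
    warm = StemmerCache(cache=pickle.load(f))
print(warm.stem("indexing"))  # cache hit, or a fresh Krovetz call, e.g. 'index'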
def get_text_centroid(text, model, stemmer=None):
    # note: the stemmer argument only acts as a flag; a fresh Krovetz
    # stemmer is instantiated locally when it is set
    sum_vector = None
    denom = 0
    if stemmer is not None:
        stem = Stemmer()
    for token in clean_sentence(text):
        if stemmer is not None:
            token = stem.stem(token)
        try:
            vector = model.wv[token]
        except KeyError:
            # skip out-of-vocabulary tokens
            continue
        if sum_vector is None:
            sum_vector = np.zeros(vector.shape[0])
        sum_vector = sum_vector + vector
        denom += 1
    if sum_vector is None:
        return None
    return sum_vector / denom
def get_stemmed_words_index(self, window_words_index):
    '''
    Get stemmed-words index from window-words index.
    :param window_words_index: index mapping windows to their words
    :return: dict mapping each stemmed word to its surface forms
    '''
    all_words = self.get_all_words(window_words_index)
    stem_words_index = {}
    krovetz = KrovetzStemmer()
    for word in all_words:
        # stem word using Krovetz
        stemmed_word = krovetz.stem(word)
        # group surface forms by stemmed word
        stem_words_index.setdefault(stemmed_word, [])
        stem_words_index[stemmed_word].append(word)
    return stem_words_index
def __init__(self, d: Dict[WordAsID, np.ndarray], skip_stopwords=True, stem=True):
    self.tokenizer = get_tokenizer()
    self.stopwords_as_ids: Set[WordAsID] = set()
    if skip_stopwords:
        # drop single-token stopwords, remembering their ids
        stopwords = load_stopwords_for_query()
        new_d = {}
        for key in d.keys():
            tokens = decode_word_as_id(self.tokenizer, key)
            if len(tokens) == 1 and tokens[0] in stopwords:
                self.stopwords_as_ids.add(key)
            else:
                new_d[key] = d[key]
        d = new_d
    if stem:
        # merge entries whose surface forms share a Krovetz stem,
        # averaging their token scores
        d_raw = defaultdict(list)
        stemmer = Stemmer()
        for key in d.keys():
            tokens = decode_word_as_id(self.tokenizer, key)
            plain_word = pretty_tokens(tokens, True)
            stemmed = stemmer.stem(plain_word)
            d_raw[stemmed].append(d[key])
        new_d: Dict[str, TokenScore] = {}
        for key, items in d_raw.items():
            score: TokenScore = [average([t[0] for t in items]),
                                 average([t[1] for t in items])]
            new_d[key] = score
        d = new_d
        self.stem = True
        self.stemmer = stemmer
        self.log_odd = self.log_odd_w_stem
    self.d = d
    self.smoothing = 0.1
tok = Tokenizer()
a = A()
text = "She even shows-me her boobs and I like it.\nHello world!"
print(A.tok(text))

# tokenize and lowercase
print(list(
    token.lower()
    for token in word_tokenize(
        re.sub("\n", "",
               text.translate(str.maketrans(punctuation, " " * len(punctuation)))))
    if token.isalnum()))

# ... additionally drop stopwords
print(list(
    token.lower()
    for token in word_tokenize(
        re.sub("\n", "",
               text.translate(str.maketrans(punctuation, " " * len(punctuation)))))
    if token.isalnum() and token.lower() not in stopwords.words("english")))

# ... additionally apply Krovetz stemming
stemmer = Stemmer()
print(list(
    stemmer.stem(token.lower())
    for token in word_tokenize(
        re.sub("\n", "",
               text.translate(str.maketrans(punctuation, " " * len(punctuation)))))
    if token.isalnum() and token.lower() not in stopwords.words("english")))
def get_term_frequency(text, term):
    stemmer = Stemmer()
    return [stemmer.stem(token) for token in text.split()].count(term)
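Example call; note that only the text tokens are stemmed, so the term argument has to be supplied already in stemmed form.

print(get_term_frequency("running runs run fast", "run"))  # 3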
import json

from krovetzstemmer import Stemmer as KrovetzStemmer
import unicodecsv as csv
from prettyprint import prettyprint

# instantiate Krovetz stemmer
krovetz = KrovetzStemmer()

# read result of 1_index
with open('1_2_index.txt', 'rb') as f:
    str_word_files_index = f.read()
word_files_index = json.loads(str_word_files_index)

stem_word_index = {}
for word, files in word_files_index.items():
    # stem word using Krovetz
    stemmed_word = krovetz.stem(word)
    # group surface forms by stemmed word
    stem_word_index.setdefault(stemmed_word, [])
    stem_word_index[stemmed_word].append(word)

for stemmed_word, words in stem_word_index.items():
    print(u'{}: {}'.format(stemmed_word, ', '.join(words)))
print('')

filename = '3_stemmed_words.csv'
with open(filename, 'wb') as f:
    print('Writing to file {}'.format(filename))
    writer = csv.writer(f)
    for stemmed_word, words in stem_word_index.items():
        # (assumed) one row per stem, followed by its surface forms
        writer.writerow([stemmed_word] + words)
from sklearn.feature_extraction.text import CountVectorizer
# the string filters below come from gensim's preprocessing module
from gensim.parsing.preprocessing import (preprocess_string, strip_tags,
                                          strip_punctuation, strip_short,
                                          strip_multiple_whitespaces,
                                          remove_stopwords)
import codecs
import string

EPS = 10e-7
table = str.maketrans('', '', '!"#$%\'()*+,-./:;<=>?@[\\]^_`{|}~')

# The Krovetz stemmer is less "destructive" than Porter's.
# Viewing morphology as an inference process: https://dl.acm.org/citation.cfm?id=160718
from krovetzstemmer import Stemmer  # a decent stemmer for PR

ks = Stemmer()
CUSTOM_FILTERS = [
    lambda x: x.lower(),
    strip_tags,
    strip_multiple_whitespaces,
    strip_punctuation,
    remove_stopwords,
    lambda x: ks.stem(x),
]


def custom_tokenizer(s):
    return [
        w.translate(table)
        for w in preprocess_string(s, [
            strip_tags,
            lambda x: strip_short(x, 2),
            remove_stopwords,
            lambda x: ks.stem(x),
        ])
    ]


class Dataset:
    def __init__(self):
        pass
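CountVectorizer is imported above but never exercised; a minimal, hypothetical hook-up of the custom tokenizer might look like this (CountVectorizer lowercases each document before handing it to the tokenizer).

cv = CountVectorizer(tokenizer=custom_tokenizer)
X = cv.fit_transform(["The horses were running quickly.", "A horse runs."])
print(cv.get_feature_names_out())  # stemmed vocabulary of the two documents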
def get_sentence_vector(sentence, model):
    stemmer = Stemmer()
    sentence = clean_text(sentence)
    words = sentence.split()
    stemmed = [stemmer.stem(w) for w in words]
    return get_stemmed_document_vector(stemmed, model)
if stemmed:
    stemmer = Stemmer()
    vocab = load_from_pickle_file("preprocessing/pre_data/vocabulary")
    for _, query in tqdm(queries_obj.items()):
        vocab.update(query.title.split())
        vocab.update(query.desc.split())
    # stem every vocabulary entry once, then reuse the mapping everywhere
    mapping_stemmed = {}
    print("Stemming...")
    for word in tqdm(vocab):
        mapping_stemmed[word] = stemmer.stem(word)
    for _, doc in tqdm(corpus_obj.docs.items()):
        doc.headline = " ".join(
            [mapping_stemmed[word] for word in doc.headline.split()])
        doc.content = " ".join(
            [mapping_stemmed[word] for word in doc.content.split()])
    for _, query in tqdm(queries_obj.items()):
        query.title = " ".join(
            [mapping_stemmed[word] for word in query.title.split()])
        query.desc = " ".join(
            [mapping_stemmed[word] for word in query.desc.split()])
    corpus_sent = [
        list(map(lambda w: mapping_stemmed[w], sent))
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
from krovetzstemmer import Stemmer
stemmer = Stemmer()

s = ("According to Wikipedia, Information Retrieval is the activity of "
     "obtaining information resources relevant to an information need "
     "from a collection of information resources.")

tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(s)
print(tokens)
words = [w.lower() for w in tokens]
print(words)
non_stopped_words = [w for w in words if w not in stopwords]
print(non_stopped_words)
stemmed_words = [stemmer.stem(w) for w in non_stopped_words]
print(stemmed_words)
from Porter import PorterStemmer
from krovetzstemmer import Stemmer
from common import readTextFromFile
from common import getTextFromHTML

krov = Stemmer()
f = 'en/articles/d/o/r/Dorothy_Block_a8f8.html'
text = getTextFromHTML(readTextFromFile(f))
print('ori:\n', text, '\n')
print('porter:\n', PorterStemmer.useStemer(text), '\n')
print('krov:\n', krov.stem(text), '\n')
class KrovetzStemmer(Stemming):
    def __init__(self):
        self.stemmer = Stemmer()

    def stem(self, text: Text) -> Text:
        return self.stemmer.stem(text)
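Hypothetical usage of the adapter above, assuming Stemming is the project's abstract stemming interface:

ks = KrovetzStemmer()
print(ks.stem("walking"))  # 'walk'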