def main():
    global args
    save_prefix = args.s
    if args.f is None:
        print('Please specify input file.')
        return
    # read file
    with open(args.f, 'r') as f:
        document = f.read()
    # strip punctuation
    document = re.sub(r"[<>:;()\?\.\!\/_,&%^*(+\"\']+", " ", document)
    # stop word list; can be replaced with your own list
    stop_word_list = stopwords.words('english')
    # tokenize, lowercase, filter stop words, and stem
    s = Stemmer()
    word_list = [
        s.stem(w.lower())
        for w in word_tokenize(text=document)
        if w not in stop_word_list
    ]
    # append as text ('a', not 'ab': we write a str, not bytes)
    with open(save_prefix + '/result.txt', 'a') as f:
        f.write(' '.join(w for w in word_list))
def clean_text(text):
    stemmer = Stemmer()
    return [
        stemmer.stem(token.lower())
        for token in nltk.word_tokenize(re.sub("[ ]+", " ", re.sub("\n", "", text)))
        if (token.isalnum() and token not in stopwords.words("english"))
    ]
def __init__(self, cache=None):
    from krovetzstemmer import Stemmer
    self.stemmer = Stemmer()
    if cache is not None:
        self.cache = cache
    else:
        self.cache = dict()
def load_clueweb12_B13_termstat_stemmed() -> Tuple[Dict, Dict]:
    from krovetzstemmer import Stemmer
    stemmer = Stemmer()
    tf, df = load_clueweb12_B13_termstat()

    new_tf = Counter()
    for key, cnt in tf.items():
        new_tf[stemmer.stem(key)] += cnt

    df_info = defaultdict(list)
    for key, cnt in df.items():
        df_info[stemmer.stem(key)].append(cnt)

    new_df = Counter()
    for key, cnt_list in df_info.items():
        cnt_list.sort(reverse=True)
        discount = 1
        discount_factor = 0.3
        df_est = 0
        for cnt in cnt_list:
            df_est += cnt * discount
            discount *= discount_factor
        new_df[key] = int(df_est)
    return new_tf, new_df
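# A small standalone sketch (not from the original code) of the discounted
# document-frequency estimate above: counts of surface forms that stem to the
# same term are summed with geometrically decaying weights (1, 0.3, 0.09, ...),
# so the largest count dominates. The counts below are made up for illustration.
cnt_list = [100, 40, 10]
discount = 1
discount_factor = 0.3
df_est = 0
for cnt in sorted(cnt_list, reverse=True):
    df_est += cnt * discount        # 100*1 + 40*0.3 + 10*0.09 = 112.9
    discount *= discount_factor
print(int(df_est))                  # 112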
def stem(algo, text):
    if algo == "krovetz":
        stemmer = Stemmer()
        return stemmer.stem(text)
    elif algo == "porter":
        stm = PorterStemmer()
        return stm.stem(text)
    print("ERROR STEMMING: {t} unknown.".format(t=algo))
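# Hypothetical call sketch for the dispatcher above (assumes krovetzstemmer's
# Stemmer and NLTK's PorterStemmer are imported); outputs are illustrative and
# depend on the installed stemmer versions.
print(stem("krovetz", "running"))   # e.g. "run"
print(stem("porter", "running"))    # e.g. "run"
stem("snowball", "running")         # no branch matches, so the error message is printed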
def __init__(self, vocab_path, unk="<UNK>", pad="<PAD>"):
    self.vocab_path = vocab_path
    self.unk = unk
    self.pad = pad
    self.word2idx = self.load_vocab(vocab_path)
    self.sws = {}
    for w in stopwords.words('english'):
        self.sws[w] = 1
    self.stemmer = Stemmer()
def stem(algo, text):
    if algo == "krovetz":
        stemmer = Stemmer()
        return stemmer.stem(text)
    elif algo == "porter":
        s = stm()
        return s.stem(text)
    else:
        print("ERROR STEMMING: {t} unknown.".format(t=algo))
        exit()
def stem(text):
    # print("Stemming...")
    stemmer = Stemmer()
    stemmed = ""
    for word in text.split():
        # start a new line whenever a 'docid' marker is encountered
        if word == 'docid':
            stemmed = stemmed + '\n'
        stemmed = stemmed + ' ' + stemmer.stem(word)
    return stemmed
def load_df_stemmed(term_stat_path):
    stemmer = Stemmer()
    df = load_df(term_stat_path)
    new_df = Counter()
    for key, value in df.items():
        try:
            new_df[stemmer.stem(key)] += value
        except UnicodeDecodeError:
            pass
    return new_df
def read_past_winners_file(winners_file):
    winners_data = {}
    stemmer = Stemmer()
    with open(winners_file) as file:
        for line in file:
            query = line.split("@@@")[0]
            text = line.split("@@@")[1]
            if query not in winners_data:
                winners_data[query] = []
            text = " ".join([stemmer.stem(word) for word in clean_text(text).split()])
            winners_data[query].append(text)
    return winners_data
def tokenize(text):
    stemmer = Stemmer()
    return [
        stemmer.stem(token.lower())
        for token in nltk.word_tokenize(
            re.sub(
                "\n", "",
                text.translate(
                    str.maketrans(punctuation, " " * len(punctuation))),
            ))
        if (token.isalnum()
            and token.lower() not in stopwords.words("english")
            and len(token) > 1)
    ]
def data_to_wordsentences(raw_data):
    """
    Convert a text into a list of sentences.
    :param raw_data: the text to be converted
    :return: list of sentences
    """
    sentences = []
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(raw_data.text.strip())
    stemmer = Stemmer()
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            words = tokenise_text(raw_sentence)
            for idx, w in enumerate(words):
                words[idx] = stemmer.stem(w.decode("utf-8", "ignore"))
            sentences.append(words)
    return sentences
def tokenize(text, stemming=True, stoplist=[], remove_digits=False, lang='en'):
    # map punctuation to space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(translator)
    text = text.lower()
    text = text.strip()
    table = str.maketrans({key: None for key in string.punctuation})
    text = text.translate(table)
    if stemming:
        if lang == 'en':
            stemmer = Stemmer()
        elif lang == 'it':
            stemmer = SnowballStemmer('italian')
        elif lang == 'de':
            stemmer = SnowballStemmer('german')
        elif lang == 'fa':
            stemmer = paStemmer()
        analyzer = StemmingAnalyzer(stoplist=stoplist, minsize=1, stemfn=stemmer.stem)
    else:
        analyzer = StandardAnalyzer(stoplist=stoplist, minsize=1)
    tokens = [token.text for token in analyzer(text)]
    if remove_digits:
        tokens = [
            word for word in tokens
            if not contains_digits(word) and 2 <= len(word)
        ]
    return tokens
def generate_sentences_list_from_raw_text_list(raw_text_list):
    """
    Convert a list of texts into a list of sentences for training Word2Vec.
    :param raw_text_list: list of texts to be converted
    :return: list of sentences
    """
    sentences_list = []
    stemmer = Stemmer()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for i in range(len(raw_text_list)):
        raw_sentences = tokenizer.tokenize(raw_text_list[i])
        for raw_sentence in raw_sentences:
            if len(raw_sentence) > 0:
                words = tokenise_text(raw_sentence)
                for idx, w in enumerate(words):
                    words[idx] = stemmer.stem(w.decode("utf-8", "ignore"))
                sentences_list.append(words)
    return sentences_list
def get_text_centroid(text, model, stemmer=None):
    sum_vector = None
    denom = 0
    if stemmer is not None:
        stem = Stemmer()
    for token in clean_sentence(text):
        if stemmer is not None:
            token = stem.stem(token)
        try:
            vector = model.wv[token]
        except KeyError:
            continue
        if sum_vector is None:
            sum_vector = np.zeros(vector.shape[0])
        sum_vector = sum_vector + vector
        denom += 1
    if sum_vector is None:
        return None
    return sum_vector / denom
def get_stemmed_words_index(self, window_words_index):
    '''
    Get stemmed-words index from window-words index
    :param window_words_index:
    :return:
    '''
    all_words = self.get_all_words(window_words_index)
    stem_words_index = {}
    krovetz = KrovetzStemmer()
    for word in all_words:
        # Stem word using krovetz
        stemmed_word = krovetz.stem(word)
        # Group by stemmed word
        stem_words_index.setdefault(stemmed_word, [])
        stem_words_index[stemmed_word].append(word)
    return stem_words_index
def modify_text(text, index, query):
    stemmer = Stemmer()
    query_terms = [stemmer.stem(q) for q in query.split()]
    new_text = ""
    if index == 4:
        new_text = query + text + query
        return new_text
    elif index == 0:
        p = 0.5
    elif index == 2:
        p = 0.2
    tokens = clean_texts(text).split()
    for token in tokens:
        if stemmer.stem(token) in query_terms:
            if random() < p:
                continue
        new_text += token + " "
    return new_text
class Tokenizer:
    def __init__(self, vocab_path, unk="<UNK>", pad="<PAD>"):
        self.vocab_path = vocab_path
        self.unk = unk
        self.pad = pad
        self.word2idx = self.load_vocab(vocab_path)
        self.sws = {}
        for w in stopwords.words('english'):
            self.sws[w] = 1
        self.stemmer = Stemmer()

    def load_vocab(self, vocab_path):
        word2idx = {}
        word2idx[self.pad] = 0
        word2idx[self.unk] = 1
        with open(vocab_path) as fin:
            for step, line in enumerate(fin):
                tokens = line.strip().split()
                word2idx[tokens[0]] = step + 2
        return word2idx

    def tok2idx(self, toks, word2idx):
        input_ids = []
        for tok in toks:
            if tok in word2idx:
                input_ids.append(word2idx[tok])
            else:
                input_ids.append(word2idx['<UNK>'])
        return input_ids

    def tokenize(self, line):
        regex_drop_char = re.compile(r'[^a-z0-9\s]+')
        regex_multi_space = re.compile(r'\s+')
        toks = regex_multi_space.sub(
            ' ', regex_drop_char.sub(' ', line.lower())).strip().split()
        wordsFiltered = []
        for w in toks:
            if w not in self.sws:
                w = self.stemmer.stem(w)
                wordsFiltered.append(w)
        return wordsFiltered

    def convert_tokens_to_ids(self, toks):
        input_ids = []
        for tok in toks:
            if tok in self.word2idx:
                input_ids.append(self.word2idx[tok])
            else:
                input_ids.append(self.word2idx[self.unk])
        return input_ids
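# Hypothetical usage sketch for the Tokenizer class above; "vocab.txt" is a
# made-up path to a file with one vocabulary term per line.
tokenizer = Tokenizer("vocab.txt")
toks = tokenizer.tokenize("The cats are running quickly!")
# Stop words are dropped and the remaining tokens are Krovetz-stemmed before lookup.
ids = tokenizer.convert_tokens_to_ids(toks)   # out-of-vocabulary tokens map to the <UNK> id (1)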
def get_max_values(queries: Dict[str, str]) -> Dict[str, float]:
    tf, df = load_clueweb12_B13_termstat_stemmed()
    stemmer = Stemmer()
    avdl = 500
    bm25_module = BM25(df, cdf, avdl)
    score_d = {}
    for qid, query_text in queries.items():
        q_terms = extract_terms_from_structured_query(query_text)
        q_terms_stemmed: List[str] = lmap(stemmer.stem, q_terms)
        q_tf = Counter(q_terms_stemmed)
        d_tf = q_tf
        score = bm25_module.score_inner(q_tf, d_tf)
        score_d[qid] = score
    return score_d
class PCTokenizer:
    def __init__(self):
        self.stemmer = Stemmer()

    def tokenize_stem(self, text: str) -> List[str]:
        tokens = nltk.tokenize.word_tokenize(text)
        stemmed_tokens = []
        for t in tokens:
            try:
                stemmed_tokens.append(self.stemmer.stem(t))
            except:
                pass
        return stemmed_tokens
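# Usage sketch for PCTokenizer (assumes NLTK's punkt tokenizer data is available);
# tokens the stemmer cannot handle are silently skipped.
pc_tokenizer = PCTokenizer()
stemmed = pc_tokenizer.tokenize_stem("Dogs were barking loudly.")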
def __init__(self, d: Dict[WordAsID, np.array], skip_stopwords=True, stem=True):
    self.tokenizer = get_tokenizer()
    self.stopwords_as_ids: Set[WordAsID] = set()
    new_d = {}
    if skip_stopwords:
        stopwords = load_stopwords_for_query()
        for key in d.keys():
            tokens = decode_word_as_id(self.tokenizer, key)
            if len(tokens) == 1 and tokens[0] in stopwords:
                self.stopwords_as_ids.add(key)
            else:
                new_d[key] = d[key]
        d = new_d

    if stem:
        d_raw = defaultdict(list)
        stemmer = Stemmer()
        for key in d.keys():
            tokens = decode_word_as_id(self.tokenizer, key)
            plain_word = pretty_tokens(tokens, True)
            stemmed = stemmer.stem(plain_word)
            d_raw[stemmed].append(d[key])

        new_d: Dict[str, TokenScore] = {}
        for key, items in d_raw.items():
            score: TokenScore = [average([t[0] for t in items]),
                                 average([t[1] for t in items])]
            new_d[key] = score
        d = new_d
        self.stem = True
        self.stemmer = stemmer
        self.log_odd = self.log_odd_w_stem

    self.d = d
    self.smoothing = 0.1
class CacheStemmer:
    def __init__(self):
        self.stemmer = Stemmer()
        self.stem_dict = dict()

    def stem(self, token):
        if token in self.stem_dict:
            return self.stem_dict[token]
        else:
            r = self.stemmer.stem(token)
            self.stem_dict[token] = r
            return r

    def stem_list(self, tokens):
        return list([self.stem(t) for t in tokens])
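# Short usage sketch for CacheStemmer: repeated tokens are served from the
# in-memory stem_dict instead of re-running the Krovetz stemmer.
cache_stemmer = CacheStemmer()
print(cache_stemmer.stem_list(["running", "ran", "running", "studies"]))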
def build_krovetz_index():
    stemmer = Stemmer()
    stopwords = load_stopwords()
    stem_dict = dict()

    def stem(token):
        if token in stem_dict:
            return stem_dict[token]
        else:
            r = stemmer.stem(token)
            stem_dict[token] = r
            return r

    collection = trec.load_robust(trec.robust_path)
    print("writing...")
    inv_index = dict()
    ticker = TimeEstimator(len(collection))
    for doc_id in collection:
        content = collection[doc_id]
        tokens = nltk.tokenize.wordpunct_tokenize(content)
        terms = dict()
        for idx, t in enumerate(tokens):
            if t in stopwords:
                continue
            t_s = stem(t)
            if t_s not in terms:
                terms[t_s] = list()
            terms[t_s].append(idx)
        for t_s in terms:
            if t_s not in inv_index:
                inv_index[t_s] = list()
            posting = (doc_id, terms[t_s])
            inv_index[t_s].append(posting)
        ticker.tick()

    save_path = os.path.join(cpath.data_path, "adhoc", "robust_inv_index.pickle")
    pickle.dump(inv_index, open(save_path, "wb"))
def lm_protext_ex(self):
    train_X, train_Y = load_protest.load_data("train")
    dev_X, dev_Y = load_protest.load_data("dev")
    stemmer = Stemmer()
    dir_protest = os.path.join(cpath.data_path, "protest", "pseudo_docs", "dbpedia")
    dir_crime = os.path.join(cpath.data_path, "protest", "crime_docs")
    tf_path = os.path.join(cpath.data_path, "protest", "pseudo_docs", "tf_dump_100.txt")
    tokenizer = lambda x: tokenize(x, set(), False)
    n_docs = 3000
    protest_docs = controversy.load_dir_docs(dir_protest)[:n_docs]
    protest_docs = list([x[2] for x in protest_docs])
    crime_docs = controversy.load_dir_docs(dir_crime)[:1000]
    crime_docs = list([x[2] for x in crime_docs])
    bg_ctf, bg_tf = galagos.basic.load_tf(tf_path)
    print("Using {} docs".format(len(protest_docs)))
    classifier = LMClassifierEx(tokenizer, stemmer)
    classifier.build([protest_docs, crime_docs], bg_tf, bg_ctf)
    classifier.smoothing = 0.01
    x_list = list([x[1] for x in train_X])
    y_list = list([train_Y[x[0]] for x in train_X])
    classifier.fulltext = True

    def get_ap(y_rank):
        y_rank.sort(key=lambda x: x[1], reverse=True)
        return AP(left(y_rank), dev_Y)

    classifier.tune_alpha(x_list, y_list)
    y_rank_method = []
    for name, doc in dev_X:
        s = classifier.get_score(doc)
        y_rank_method.append((name, s))
    print("AP(LM_ex) :", get_ap(y_rank_method))

    classifier.alpha_list = [0, -9999, 0]
    y_rank_method = []
    for name, doc in dev_X:
        s = classifier.get_score(doc)
        y_rank_method.append((name, s))
    print("AP(LM_ex) before tune:", get_ap(y_rank_method))
class StemmerCache:
    def __init__(self, cache=None):
        from krovetzstemmer import Stemmer
        self.stemmer = Stemmer()
        if cache is not None:
            self.cache = cache
        else:
            self.cache = dict()

    def stem(self, t):
        if t in self.cache:
            return self.cache[t]
        else:
            r = self.stemmer.stem(t)
            self.cache[t] = r
            if len(self.cache) % 1000 == 0:
                pickle.dump(self.cache, open("stemmer.pickle", "wb"))
            return r
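# StemmerCache differs from CacheStemmer above in that it can be warm-started
# from an existing dictionary and periodically pickles its cache to
# "stemmer.pickle" (the file name hard-coded in stem()). A hypothetical
# warm-start sketch:
import os
import pickle

cache = pickle.load(open("stemmer.pickle", "rb")) if os.path.exists("stemmer.pickle") else None
stemmer_cache = StemmerCache(cache)
print(stemmer_cache.stem("running"))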
def __init__(self, *, terms, scores=None, probs=None, stem=False, remove_stop=False):
    """scores: a list of scores; can be log values or probabilities
    terms: [['a', 'b', 'c', 'd'], ['e', 'f', 'g', 'h']]
           or ['a b c d', 'e f g h']
           or [[['a', 0.4], ['b', 0.3], ['c', 0.2]], [['e', 0.5], ['f', 0.5]]]
    """
    if (scores and probs) or (not scores and not probs):
        raise Exception('One of scores and probs must be specified.')
    if scores and not probs:
        probs = [math.exp(s - scores[0]) for s in scores]
        probs = normalize(probs)
    stemmer = Stemmer()
    if isinstance(terms[0], str):
        terms = [s.split() for s in terms]
    else:
        assert hasattr(terms[0], '__iter__')
    if stem:
        terms = [list(map(stemmer.stem, s)) for s in terms]
    rm = {}
    for doc_prob, term_list in zip(probs, terms):
        length = len(term_list)
        for term, occur in Counter(term_list).items():
            rm.setdefault(term, 0.0)
            rm[term] += doc_prob * (occur / length)
    # Removing stop words must happen after generating the
    # distribution because it changes document length.
    if remove_stop:
        rm = {t: p for t, p in rm.items() if t not in stopwords}
    rm = normalize(rm)
    self._rm = rm
def get_wiki_doc_lm(fulltext=False):
    print("Building LM from wikipedia controversy list")
    train_data = amsterdam.get_train_data(separate=True)
    pos_entries, neg_entries = train_data
    stemmer = Stemmer()

    def doc_rep(entry):
        return entry["title"] + "\t" + entry["content"]

    pos_docs = list(map(doc_rep, pos_entries))
    neg_docs = list(map(doc_rep, neg_entries))
    y = list(1 for _ in pos_docs) + list(0 for _ in neg_docs)
    all_docs = pos_docs + neg_docs
    tokenizer = lambda x: tokenize(x, set(), False)
    classifier = LMClassifer(tokenizer, stemmer, fulltext=True)
    classifier.build2(all_docs, y)
    return classifier
def tokenize(text, stemming=True, stoplist=None):
    kstemmer = Stemmer()
    # map punctuation to space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(translator)
    text = text.lower()
    text = text.strip()
    table = str.maketrans({key: None for key in string.punctuation})
    text = text.translate(table)
    if stemming:
        analyzer = StemmingAnalyzer(stoplist=stoplist, minsize=2, stemfn=kstemmer.stem)
    else:
        analyzer = StandardAnalyzer(stoplist=stoplist, minsize=2)
    tokens = [token.text for token in analyzer(text)]
    tokens = [word for word in tokens if not contains_digits(word)]
    return tokens
def get_dbpedia_contrv_lm():
    print("Building LM from DBPedia's controversy ranked docs")
    stemmer = Stemmer()
    cont_docs = controversy.load_pseudo_controversy_docs("dbpedia")[:7500]
    print("Using {} docs".format(len(cont_docs)))
    tokenizer = lambda x: tokenize(x, set(), False)
    assert cont_docs[0][0] == 1
    print("Loading collection stats")
    bg_ctf, bg_tf = controversy.load_tf("tf_dump_100.txt")
    bg_ctf = sum(bg_tf.values())
    cont_docs_text = list([x[2] for x in cont_docs])
    print("Building LM classifier")
    classifier = LMClassifer(tokenizer, stemmer, fulltext=False)
    classifier.build(cont_docs_text, bg_tf, bg_ctf)
    return classifier
def get_wiki_doc():
    train_data = amsterdam.get_train_data(separate=True)
    pos_entries, neg_entries = train_data
    stemmer = Stemmer()

    def doc_rep(entry):
        return entry["title"] + "\t" + entry["content"]

    pos_docs = list(map(doc_rep, pos_entries))
    neg_docs = list(map(doc_rep, neg_entries))
    y = list(1 for _ in pos_docs) + list(0 for _ in neg_docs)
    all_docs = pos_docs + neg_docs
    tokenizer = lambda x: tokenize(x, set(), False)
    X = []
    voca = set()
    for doc in all_docs:
        tokens = tokenizer(doc)
        voca.update(tokens)
        X.append(tokens)
    return X, y, voca