Example #1
def main():
    global args
    save_prefix = args.s

    if args.f is None:
        print('Please specify input file.')
        return

    # read file
    with open(args.f, 'r') as f:
        document = f.read()

    # strip punctuation
    document = re.sub(r"[<>:;()?.!/_,&%^*(+\"']+", " ", document)

    # stop word list; replace it with your own list if needed
    stop_word_list = stopwords.words('english')

    # split word, lower word, filter stop word
    s = Stemmer()

    word_list = [
        s.stem(w.lower()) for w in word_tokenize(text=document)
        if w.lower() not in stop_word_list
    ]

    with open(save_prefix + '/result.txt', 'a') as f:
        f.write(' '.join(word_list))
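
main() relies on a module-level args object with f (input file) and s (output prefix) attributes that the excerpt never defines. A minimal sketch of the assumed command-line parsing, with hypothetical -f and -s flags inferred from args.f / args.s, could look like this:

import argparse

if __name__ == '__main__':
    # Hypothetical flags; only args.f and args.s appear in the original snippet.
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', help='input text file to tokenize')
    parser.add_argument('-s', help='directory prefix for result.txt')
    args = parser.parse_args()
    main()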
Example #2
def clean_text(text):
    stemmer = Stemmer()
    return [
        stemmer.stem(token.lower())
        for token in nltk.word_tokenize(re.sub("[ ]+", " ", re.sub("\n", "", text)))
        if (token.isalnum() and token not in stopwords.words("english"))
    ]
Example #3
 def __init__(self, cache=None):
     from krovetzstemmer import Stemmer
     self.stemmer = Stemmer()
     if cache is not None:
         self.cache = cache
     else:
         self.cache = dict()
Example #4
def load_clueweb12_B13_termstat_stemmed() -> Tuple[Dict, Dict]:
    from krovetzstemmer import Stemmer
    stemmer = Stemmer()
    tf, df = load_clueweb12_B13_termstat()
    new_tf = Counter()

    for key, cnt in tf.items():
        new_tf[stemmer.stem(key)] += cnt

    df_info = defaultdict(list)
    for key, cnt in df.items():
        df_info[stemmer.stem(key)].append(cnt)

    new_df = Counter()
    for key, cnt_list in df_info.items():
        cnt_list.sort(reverse=True)
        discount = 1
        discount_factor = 0.3
        df_est = 0
        for cnt in cnt_list:
            df_est += cnt * discount
            discount *= discount_factor

        new_df[key] = int(df_est)
    return new_tf, new_df
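
The loop above estimates a document frequency for each stemmed term by discounting the counts of its surface forms geometrically: the largest count is taken in full, the next is weighted by 0.3, the one after by 0.09, and so on. A small standalone illustration with made-up counts:

# Made-up counts for three surface forms sharing one stem.
cnt_list = [100, 50, 10]
discount, discount_factor, df_est = 1, 0.3, 0
for cnt in cnt_list:
    df_est += cnt * discount
    discount *= discount_factor
print(int(df_est))  # 100 + 0.3*50 + 0.09*10 = 115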
Example #5
def stem(algo, text):
    if algo == "krovetz":
        stemmer = Stemmer()
        return stemmer.stem(text)
    elif algo == "porter":
        stm = PorterStemmer()
        return stm.stem(text)
    print("ERROR STEMMING: {t} unkown.".format(t=algo))
Example #6
 def __init__(self, vocab_path, unk="<UNK>", pad="<PAD>"):
     self.vocab_path = vocab_path
     self.unk = unk
     self.pad = pad
     self.word2idx = self.load_vocab(vocab_path)
     self.sws = {}
     for w in stopwords.words('english'):
         self.sws[w] = 1
     self.stemmer = Stemmer()
Example #7
def stem(algo, text):
    if algo == "krovetz":
        stemmer = Stemmer()
        return stemmer.stem(text)
    elif algo == "porter":
        s = stm()
        return s.stem(text)
    else:
        print("ERROR STEMMING: {t} unknown.".format(t=algo))
        exit()
Example #8
def stem(text):
    # print("Stemming...")
    stemmer = Stemmer()
    stemmed = ""
    for word in text.split():
        if word == 'docid':
            stemmed = stemmed + '\n'
        stemmed = stemmed + ' ' + stemmer.stem(word)

    return stemmed
Example #9
def load_df_stemmed(term_stat_path):
    stemmer = Stemmer()
    df = load_df(term_stat_path)

    new_df = Counter()
    for key, value in df.items():
        try:
            new_df[stemmer.stem(key)] += value
        except UnicodeDecodeError:
            pass
    return new_df
Example #10
def read_past_winners_file(winners_file):
    winners_data = {}
    stemmer = Stemmer()
    with open(winners_file) as file:
        for line in file:
            query = line.split("@@@")[0]
            text = line.split("@@@")[1]
            if query not in winners_data:
                winners_data[query]=[]
            text = " ".join([stemmer.stem(word) for word in clean_text(text).split()])
            winners_data[query].append(text)
    return winners_data
Example #11
 def tokenize(text):
     stemmer = Stemmer()
     return [
         stemmer.stem(token.lower()) for token in nltk.word_tokenize(
             re.sub(
                 "\n",
                 "",
                 text.translate(
                     str.maketrans(punctuation, " " * len(punctuation))),
             )) if (token.isalnum() and token.lower() not in
                    stopwords.words("english") and len(token) > 1)
     ]
Example #12
def data_to_wordsentences(raw_data):
    """ convert a text to list of sentences
    :param raw_data: a text to be converted
    :return: list if sentences
    """
    sentences = []
    stemmer = Stemmer()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(raw_data.text.strip())
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            words = tokenise_text(raw_sentence)
            for idx, w in enumerate(words):
                words[idx] = stemmer.stem(w.decode("utf-8", "ignore"))
            sentences.append(words)
    return sentences
Example #13
def tokenize(text, stemming=True, stoplist=[], remove_digits=False, lang='en'):
    translator = str.maketrans(
        string.punctuation,
        ' ' * len(string.punctuation))  # map punctuation to space
    text = text.translate(translator)
    text = text.lower()
    text = text.strip()
    table = str.maketrans({key: None for key in string.punctuation})
    text = text.translate(table)
    if stemming:
        if lang == 'en':
            stemmer = Stemmer()
        elif lang == 'it':
            stemmer = SnowballStemmer('italian')
        elif lang == 'de':
            stemmer = SnowballStemmer('german')
        elif lang == 'fa':
            stemmer = paStemmer()
        analyzer = StemmingAnalyzer(stoplist=stoplist,
                                    minsize=1,
                                    stemfn=stemmer.stem)
    else:
        analyzer = StandardAnalyzer(stoplist=stoplist, minsize=1)

    tokens = [token.text for token in analyzer(text)]
    if remove_digits:
        tokens = [
            word for word in tokens
            if not contains_digits(word) and 2 <= len(word)
        ]
    return tokens
Example #14
def generate_sentences_list_from_raw_text_list(raw_text_list):
    """ convert list of texts into list of sentences for the traning of Word2Vec
    :param raw_text_list: list of texts to be converted
    :return: list if sentences
    """

    sentences_list = []
    stemmer = Stemmer()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for i in range(len(raw_text_list)):
        raw_sentences = tokenizer.tokenize(raw_text_list[i])
        for raw_sentence in raw_sentences:
            if len(raw_sentence) > 0:
                words = tokenise_text(raw_sentence)
                for idx, w in enumerate(words):
                    words[idx] = stemmer.stem(w.decode("utf-8", "ignore"))
                sentences_list.append(words)
    return sentences_list
Example #15
def get_text_centroid(text, model, stemmer=None):
    sum_vector = None
    denom = 0
    if stemmer is not None:
        stem = Stemmer()
    for token in clean_sentence(text):
        if stemmer is not None:
            token = stem.stem(token)
        try:
            vector = model.wv[token]
        except KeyError:
            continue
        if sum_vector is None:
            sum_vector = np.zeros(vector.shape[0])
        sum_vector = sum_vector + vector
        denom += 1
    if sum_vector is None:
        return None
    return sum_vector / denom
Example #16
    def get_stemmed_words_index(self, window_words_index):
        '''
        Get stemmed-words index from window-words index
        :param window_words_index:
        :return:
        '''

        all_words = self.get_all_words(window_words_index)
        stem_words_index = {}

        krovetz = KrovetzStemmer()
        for word in all_words:
            # Stem word using krovetz
            stemmed_word = krovetz.stem(word)

            # Group by stemmed word
            stem_words_index.setdefault(stemmed_word, [])
            stem_words_index[stemmed_word].append(word)

        return stem_words_index
Example #17
def modify_text(text, index, query):
    stemmer = Stemmer()
    query_terms = [stemmer.stem(q) for q in query.split()]
    new_text = ""

    if index == 4:
        new_text = query + text + query
        return new_text

    elif index == 0:
        p = 0.5
    elif index == 2:
        p = 0.2

    tokens = clean_texts(text).split()

    for token in tokens:
        if stemmer.stem(token) in query_terms:
            if random() < p:
                continue
        new_text += token + " "
    return new_text
Example #18
class Tokenizer:
    def __init__(self, vocab_path, unk="<UNK>", pad="<PAD>"):
        self.vocab_path = vocab_path
        self.unk = unk
        self.pad = pad
        self.word2idx = self.load_vocab(vocab_path)
        self.sws = {}
        for w in stopwords.words('english'):
            self.sws[w] = 1
        self.stemmer = Stemmer()

    def load_vocab(self, vocab_path):
        word2idx = {}
        word2idx[self.pad] = 0
        word2idx[self.unk] = 1
        with open(vocab_path) as fin:
            for step, line in enumerate(fin):
                tokens = line.strip().split()
                word2idx[tokens[0]] = step + 2
        return word2idx

    def tok2idx(self, toks, word2idx):
        input_ids = []
        for tok in toks:
            if tok in word2idx:
                input_ids.append(word2idx[tok])
            else:
                input_ids.append(word2idx['<UNK>'])
        return input_ids

    def tokenize(self, line):
        regex_drop_char = re.compile(r'[^a-z0-9\s]+')
        regex_multi_space = re.compile(r'\s+')
        toks = regex_multi_space.sub(' ', regex_drop_char.sub(
            ' ', line.lower())).strip().split()
        wordsFiltered = []
        for w in toks:
            if w not in self.sws:
                w = self.stemmer.stem(w)
                wordsFiltered.append(w)
        return wordsFiltered

    def convert_tokens_to_ids(self, toks):
        input_ids = []
        for tok in toks:
            if tok in self.word2idx:
                input_ids.append(self.word2idx[tok])
            else:
                input_ids.append(self.word2idx[self.unk])
        return input_ids
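
A usage sketch for the Tokenizer above. The vocabulary path is hypothetical; per load_vocab, the file holds one term per line and ids are assigned in file order starting at 2:

# 'vocab.txt' is a hypothetical path; terms not in it map to <UNK>.
tokenizer = Tokenizer("vocab.txt")
toks = tokenizer.tokenize("The cats were running quickly!")
ids = tokenizer.convert_tokens_to_ids(toks)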
Example #19
def get_max_values(queries: Dict[str, str]) -> Dict[str, float]:
    tf, df = load_clueweb12_B13_termstat_stemmed()
    stemmer = Stemmer()
    avdl = 500
    bm25_module = BM25(df, cdf, avdl)
    score_d = {}
    for qid, query_text in queries.items():
        q_terms = extract_terms_from_structured_query(query_text)
        q_terms_stemmed: List[str] = lmap(stemmer.stem, q_terms)
        q_tf = Counter(q_terms_stemmed)
        d_tf = q_tf
        score = bm25_module.score_inner(q_tf, d_tf)
        score_d[qid] = score
    return score_d
Example #20
class PCTokenizer:
    def __init__(self):
        self.stemmer = Stemmer()

    def tokenize_stem(self, text: str) -> List[str]:
        tokens = nltk.tokenize.word_tokenize(text)
        stemmed_tokens = []
        for t in tokens:
            try:
                stemmed_tokens.append(self.stemmer.stem(t))
            except:
                pass

        return stemmed_tokens
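
A short usage sketch; note that the bare except in tokenize_stem silently drops any token the Krovetz stemmer fails on:

tokenizer = PCTokenizer()
print(tokenizer.tokenize_stem("Libraries were closing early."))
# Returns Krovetz-stemmed tokens, e.g. 'libraries' -> 'library'.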
Example #21
    def __init__(self, d: Dict[WordAsID, np.array], skip_stopwords=True, stem=True):
        self.tokenizer = get_tokenizer()

        self.stopwords_as_ids: Set[WordAsID] = set()
        new_d = {}
        if skip_stopwords:
            stopwords = load_stopwords_for_query()
            for key in d.keys():
                tokens = decode_word_as_id(self.tokenizer, key)
                if len(tokens) == 1 and tokens[0] in stopwords:
                    self.stopwords_as_ids.add(key)
                else:
                    new_d[key] = d[key]
            d = new_d

        if stem:
            d_raw = defaultdict(list)
            stemmer = Stemmer()

            for key in d.keys():
                tokens = decode_word_as_id(self.tokenizer, key)
                plain_word = pretty_tokens(tokens, True)
                stemmed = stemmer.stem(plain_word)
                d_raw[stemmed].append(d[key])

            new_d: Dict[str, TokenScore] = {}
            for key, items in d_raw.items():
                score: TokenScore = [average([t[0] for t in items]), average([t[1] for t in items])]
                new_d[key] = score
            d = new_d
            self.stem = True
            self.stemmer = stemmer
            self.log_odd = self.log_odd_w_stem

        self.d = d
        self.smoothing = 0.1
Example #22
class CacheStemmer:
    def __init__(self):
        self.stemmer = Stemmer()
        self.stem_dict = dict()

    def stem(self, token):
        if token in self.stem_dict:
            return self.stem_dict[token]
        else:
            r = self.stemmer.stem(token)
            self.stem_dict[token] = r
            return r

    def stem_list(self, tokens):
        return list([self.stem(t) for t in tokens])
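
CacheStemmer memoizes results, so repeated tokens are answered from stem_dict instead of re-running the stemmer. Brief usage sketch:

cached = CacheStemmer()
print(cached.stem_list(["running", "runs", "running"]))
print(len(cached.stem_dict))  # 2 distinct surface forms cached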
Example #23
def build_krovetz_index():
    stemmer = Stemmer()
    stopwords = load_stopwords()

    stem_dict = dict()

    def stem(token):
        if token in stem_dict:
            return stem_dict[token]
        else:
            r = stemmer.stem(token)
            stem_dict[token] = r
            return r

    collection = trec.load_robust(trec.robust_path)
    print("writing...")
    inv_index = dict()
    ticker = TimeEstimator(len(collection))

    for doc_id in collection:
        content = collection[doc_id]
        tokens = nltk.tokenize.wordpunct_tokenize(content)
        terms = dict()
        for idx, t in enumerate(tokens):
            if t in stopwords:
                continue

            t_s = stem(t)

            if t_s not in terms:
                terms[t_s] = list()

            terms[t_s].append(idx)

        for t_s in terms:
            if t_s not in inv_index:
                inv_index[t_s] = list()

            posting = (doc_id, terms[t_s])
            inv_index[t_s].append(posting)

        ticker.tick()

    save_path = os.path.join(cpath.data_path, "adhoc",
                             "robust_inv_index.pickle")
    pickle.dump(inv_index, open(save_path, "wb"))
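
The pickle written above maps each stemmed term to a list of (doc_id, positions) postings. A hedged sketch of loading it back and inspecting one posting list (cpath is the project path module used above; the query term is arbitrary):

import os
import pickle
from krovetzstemmer import Stemmer

save_path = os.path.join(cpath.data_path, "adhoc", "robust_inv_index.pickle")
inv_index = pickle.load(open(save_path, "rb"))
postings = inv_index.get(Stemmer().stem("protest"), [])
for doc_id, positions in postings[:5]:
    print(doc_id, len(positions))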
Example #24
    def lm_protext_ex(self):
        train_X, train_Y = load_protest.load_data("train")
        dev_X, dev_Y = load_protest.load_data("dev")

        stemmer = Stemmer()
        dir_protest = os.path.join(cpath.data_path, "protest", "pseudo_docs", "dbpedia")
        dir_crime = os.path.join(cpath.data_path, "protest", "crime_docs")
        tf_path = os.path.join(cpath.data_path, "protest", "pseudo_docs", "tf_dump_100.txt")
        tokenizer = lambda x: tokenize(x, set(), False)

        n_docs = 3000
        protest_docs = controversy.load_dir_docs(dir_protest)[:n_docs]
        protest_docs = list([x[2] for x in protest_docs])

        crime_docs = controversy.load_dir_docs(dir_crime)[:1000]
        crime_docs = list([x[2] for x in crime_docs])
        bg_ctf, bg_tf = galagos.basic.load_tf(tf_path)
        print("Using {} docs".format(len(protest_docs)))

        classifier = LMClassifierEx(tokenizer, stemmer)
        classifier.build([protest_docs, crime_docs], bg_tf, bg_ctf)
        classifier.smoothing = 0.01

        x_list = list([x[1] for x in train_X])
        y_list = list([train_Y[x[0]] for x in train_X])

        classifier.fulltext = True
        def get_ap(y_rank):
            y_rank.sort(key=lambda x: x[1], reverse=True)
            return AP(left(y_rank), dev_Y)

        classifier.tune_alpha(x_list, y_list)

        y_rank_method = []
        for name, doc in dev_X:
            s = classifier.get_score(doc)
            y_rank_method.append((name, s))
        print("AP(LM_ex) :", get_ap(y_rank_method))

        classifier.alpha_list = [0,-9999,0]
        y_rank_method = []
        for name, doc in dev_X:
            s = classifier.get_score(doc)
            y_rank_method.append((name, s))
        print("AP(LM_ex) before tune:", get_ap(y_rank_method))
Example #25
class StemmerCache:
    def __init__(self, cache=None):
        from krovetzstemmer import Stemmer
        self.stemmer = Stemmer()
        if cache is not None:
            self.cache = cache
        else:
            self.cache = dict()

    def stem(self, t):
        if t in self.cache:
            return self.cache[t]
        else:
            r = self.stemmer.stem(t)
            self.cache[t] = r
            if len(self.cache) % 1000 == 0:
                pickle.dump(self.cache, open("stemmer.pickle", "wb"))
            return r
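
Because StemmerCache periodically dumps its dict to stemmer.pickle, a later run can warm-start by passing the saved cache back in. A minimal sketch:

import os
import pickle

cache = None
if os.path.exists("stemmer.pickle"):
    cache = pickle.load(open("stemmer.pickle", "rb"))
stemmer = StemmerCache(cache)
print(stemmer.stem("libraries"))  # answered from the cache on warm runs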
Example #26
    def __init__(self,
                 *,
                 terms,
                 scores=None,
                 probs=None,
                 stem=False,
                 remove_stop=False):
        """scores: a list of scores, can be log value or probabilities
        terms: [['a', 'b', 'c', 'd'], ['e', 'f', 'g', 'h']] or
               ['a b c d', 'e f g h'] or
               [[['a', 0.4], ['b', 0.3], ['c', 0.2]], [['e', 0.5], ['f', 0.5]]]
        """
        if (scores and probs) or (not scores and not probs):
            raise Exception('Exactly one of scores or probs must be specified.')

        if scores and not probs:
            probs = [math.exp(s - scores[0]) for s in scores]

        probs = normalize(probs)

        stemmer = Stemmer()

        if isinstance(terms[0], str):
            terms = [s.split() for s in terms]
        else:
            assert hasattr(terms[0], '__iter__')

        if stem:
            terms = [list(map(stemmer.stem, s)) for s in terms]

        rm = {}
        for doc_prob, term_list in zip(probs, terms):
            length = len(term_list)
            for term, occur in Counter(term_list).items():
                rm.setdefault(term, 0.0)
                rm[term] += doc_prob * (occur / length)

        # Removing stop words must be after generating the
        # distribution because it changes document length.
        if remove_stop:
            rm = {t: p for t, p in rm.items() if t not in stopwords}
            rm = normalize(rm)

        self._rm = rm
Example #27
def get_wiki_doc_lm(fulltext=False):
    print("Building LM from wikipedia controversy list")
    train_data = amsterdam.get_train_data(separate=True)
    pos_entries, neg_entries = train_data
    stemmer = Stemmer()

    def doc_rep(entry):
        return entry["title"] + "\t" + entry["content"]

    pos_docs = list(map(doc_rep, pos_entries))
    neg_docs = list(map(doc_rep, neg_entries))

    y = list(1 for _ in pos_docs) + list(0 for _ in neg_docs)
    all_docs = pos_docs + neg_docs

    tokenizer = lambda x: tokenize(x, set(), False)
    classifier = LMClassifer(tokenizer, stemmer, fulltext=True)
    classifier.build2(all_docs, y)
    return classifier
Example #28
def tokenize(text, stemming=True, stoplist=None):
    kstemmer = Stemmer()
    translator = str.maketrans(
        string.punctuation,
        ' ' * len(string.punctuation))  # map punctuation to space
    text = text.translate(translator)
    text = text.lower()
    text = text.strip()
    table = str.maketrans({key: None for key in string.punctuation})
    text = text.translate(table)
    if stemming:
        analyzer = StemmingAnalyzer(stoplist=stoplist,
                                    minsize=2,
                                    stemfn=kstemmer.stem)
    else:
        analyzer = StandardAnalyzer(stoplist=stoplist, minsize=2)

    tokens = [token.text for token in analyzer(text)]
    tokens = [word for word in tokens if not contains_digits(word)]
    return tokens
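
A brief usage sketch of the Whoosh-based tokenizer above (contains_digits is a project helper assumed to be in scope; exact stems depend on the Krovetz dictionary):

print(tokenize("Researchers were running 3 experiments."))
# Inflected forms are reduced (e.g. 'running' -> 'run'); '3' is dropped
# because single-character and digit-bearing tokens are both filtered.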
Example #29
def get_dbpedia_contrv_lm():
    print("Building LM from DBPedia's controversy ranked docs")

    stemmer = Stemmer()
    cont_docs = controversy.load_pseudo_controversy_docs("dbpedia")[:7500]
    print("Using {} docs".format(len(cont_docs)))
    tokenizer = lambda x: tokenize(x, set(), False)
    assert cont_docs[0][0] == 1

    print("Loading collection stats")
    bg_ctf, bg_tf = controversy.load_tf("tf_dump_100.txt")
    bg_ctf = sum(bg_tf.values())
    cont_docs_text = list([x[2] for x in cont_docs])
    print("Building LM classifier ")

    classifier = LMClassifer(tokenizer, stemmer, fulltext=False)
    classifier.build(
        cont_docs_text,
        bg_tf,
        bg_ctf,
    )
    return classifier
Example #30
def get_wiki_doc():
    train_data = amsterdam.get_train_data(separate=True)
    pos_entries, neg_entries = train_data
    stemmer = Stemmer()

    def doc_rep(entry):
        return entry["title"] + "\t" + entry["content"]

    pos_docs = list(map(doc_rep, pos_entries))
    neg_docs = list(map(doc_rep, neg_entries))

    y = list(1 for _ in pos_docs) + list(0 for _ in neg_docs)
    all_docs = pos_docs + neg_docs

    tokenizer = lambda x: tokenize(x, set(), False)

    X = []
    voca = set()
    for doc in all_docs:
        tokens = tokenizer(doc)
        voca.update(tokens)
        X.append(tokens)
    return X, y, voca