Example No. 1
def filtrar_extremos(docs, max_freq=0.5, min_wordcount=2, n_top=3):
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)
    dictionary.filter_n_most_frequent(n_top)
    _ = dictionary[0]  # accessing an item forces gensim to build the id2token mapping

    return dictionary
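A minimal usage sketch for the helper above, on a hypothetical toy corpus (the documents and parameter values below are assumptions, not part of the original example); it presumes gensim is installed and that filtrar_extremos is in scope.

from gensim.corpora import Dictionary  # required by filtrar_extremos

toy_docs = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system", "response", "time"],
    ["eps", "user", "interface", "system"],
    ["system", "human", "system", "eps"],
]
# Drop tokens seen in fewer than 2 documents, then remove the single most frequent survivor ("system").
toy_dict = filtrar_extremos(toy_docs, max_freq=0.9, min_wordcount=2, n_top=1)
print(sorted(toy_dict.token2id))  # ['computer', 'eps', 'human', 'interface', 'user']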
Example No. 2
 def preprocess_dict(self,
                     dictionary: corpora.Dictionary) -> corpora.Dictionary:
     # TODO: decouple
     dictionary.filter_n_most_frequent(5)
     # FIXME: when NO_BELOW or NO_ABOVE change, this will not automatically recreate the dictionary
     dictionary.filter_extremes(no_below=self.no_below,
                                no_above=self.no_above)
     return dictionary
 def set_dictionary(self, language_processed_data: list, no_below: int,
                    no_above: float, n_most_frequent: int,
                    dictionary_file_path):
     logging.info("---- Creating dictionary from processed data")
     dic = Dictionary(language_processed_data)
     dic.filter_n_most_frequent(n_most_frequent)
     dic.filter_extremes(no_below=no_below, no_above=no_above)
     dic.save(dictionary_file_path)
     self.dictionary = dic
     logging.info("---- Dictionary is created")
     return
Example No. 4
def lsa(corpus, size=8):
    dic = Dictionary(corpus)
    dic.filter_extremes(
        no_below=5,
        no_above=0.8,
    )
    dic.filter_n_most_frequent(remove_n=10)
    dic.compactify()
    index_corpus = [dic.doc2bow(sent) for sent in corpus]
    tfidf = TfidfModel(index_corpus, dictionary=dic)
    normed_corpus = [tfidf[sent] for sent in index_corpus]
    lsi = LsiModel(normed_corpus, num_topics=size)
    return [[x[1] for x in lsi[sent]] for sent in normed_corpus]
Example No. 5
def process_dict(train_texts, doc_len):
    dictionary = Dictionary(train_texts)
    print('dict size:', len(dictionary))
    # remove extremes
    no_below = int(doc_len * 0.008)
    filter_freq = int(doc_len * 0.2)
    print('no_below,filter_freq:', no_below, filter_freq)
    dictionary.filter_extremes(
        no_below=no_below)  # drop words that appear in fewer than 0.8% of the documents
    dictionary.filter_n_most_frequent(
        filter_freq)  # drop the filter_freq most frequent tokens (a count equal to 20% of doc_len)
    # filter_tokens(bad_ids=None, good_ids=None)
    return dictionary
Example No. 6
def get_corpus_and_dict(data_path):
    print("[BLOCK] Getting corpus and dictionary files from %s" % (data_path))
    sys.stdout.flush()

    file_paths, files_list = get_lists(data_path)

    print("[BLOCK] Building dictionary with %s documents" % len(files_list))
    sys.stdout.flush()

    dictionary = Dictionary(files_list)

    print("[BLOCK] Filtering out %s (0.1)" % (int(len(dictionary) * 0.1)))
    sys.stdout.flush()

    dictionary.filter_n_most_frequent(int(len(dictionary) * 0.1))

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(doc) for doc in files_list]

    return corpus, dictionary
Example No. 7
def preprocess_docs(docs):
    '''Preprocess all the documents and create dict + corpus.'''

    # Pre-process the documents.

    docs = [preprocess_doc(doc) for doc in docs]

    # Remove rare and common tokens.

    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)

    # Filter out words that occur in fewer than 20 documents or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)

    # Remove 5 most frequent
    dictionary.filter_n_most_frequent(5)

    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return corpus, docs, dictionary
Example No. 8
 def testFilterMostFrequent(self):
     d = Dictionary(self.texts)
     d.filter_n_most_frequent(4)
     expected = {0: 2, 1: 2, 2: 2, 3: 2, 4: 2, 5: 2, 6: 2, 7: 2}
     self.assertEqual(d.dfs, expected)
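For context, a sketch of what this test exercises, using gensim's bundled common_texts fixture (which appears to be the same self.texts corpus): filter_n_most_frequent(4) drops the four tokens with the highest document frequency, leaving eight tokens that each occur in exactly two documents, matching the expected dfs above.

from gensim.corpora import Dictionary
from gensim.test.utils import common_texts

d = Dictionary(common_texts)
d.filter_n_most_frequent(4)    # remove the 4 tokens with the highest document frequency
print(sorted(d.dfs.values()))  # [2, 2, 2, 2, 2, 2, 2, 2]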
Example No. 10
def create_dict(corpus, NUM_TOPICS=5, filter_n_most_freq=10):
    dictionary = Dictionary(corpus)
    dictionary.filter_extremes(no_below=round(0.1 / NUM_TOPICS * dictionary.num_docs))
    dictionary.filter_n_most_frequent(filter_n_most_freq)  # use the parameter instead of a hard-coded 10
    return dictionary
Example No. 11
    texts = [[token for token in line if not token.isnumeric()] for line in texts]
    # Remove words that are two or fewer characters long.
    texts = [[token for token in line if len(token) > 2] for line in texts]
    # Lemmatization (not stemming, since stemming can reduce interpretability).
    lemmatizer = WordNetLemmatizer()

    texts = [[word for word in lemmatizer.lemmatize(' '.join(line), pos='v').split()] for line in texts]
    return texts

train_texts = process_texts(train_texts)
print('bigramed train_texts',len(train_texts))
dictionary = Dictionary(train_texts)
print('dict size:',len(dictionary))
#remove extremes
dictionary.filter_extremes(no_below=10, no_above=0.1)  # remove words that appear in fewer than 10 documents or in more than 10% of documents
dictionary.filter_n_most_frequent(2000)  # filter out the 2000 most common tokens
#filter_tokens(bad_ids=None, good_ids=None)
corpus = [dictionary.doc2bow(text) for text in train_texts]
print('corpus size:',len(corpus))
coherences = []
#LSI
'''
lsimodel = LsiModel(corpus=corpus, num_topics=1, id2word=dictionary)
#print(lsimodel.show_topics(num_topics=5)) # Showing only the top 5 topics
lsitopics = lsimodel.show_topics(formatted=False)
lsitopics = [[word for word, prob in topic] for topicid, topic in lsitopics]
lsi_coherence = CoherenceModel(topics=lsitopics[:10], texts=train_texts, dictionary=dictionary, window_size=10).get_coherence()
print('LSI:',lsi_coherence)
coherences.append(lsi_coherence)
#LDA
ldamodel = LdaModel(corpus=corpus, num_topics=1, id2word=dictionary)
Example No. 12
    ion = []
    for w in i:
        if not w.is_stop and not w.is_punct and not w.like_num and w.text != 'I':
            ion.append((w.lemma_))
    txts.append(ion)
bigram = gensim.models.Phrases(txts)
txts = [bigram[line] for line in txts]
dictionary = Dictionary(txts)
# dictionary.filter_extremes(no_below=20, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in txts]

Counter(txts[1]).most_common(20)

len(dictionary)

dictionary.filter_n_most_frequent(2)
len(dictionary)

gensim.corpora.MmCorpus.serialize(
    "D:/Google Drive/BAP/text_analysis/corpus.mm", corpus)  ###SAVE

corpus = gensim.corpora.MmCorpus(
    "D:/Google Drive/BAP/text_analysis/corpus.mm")  ###LOAD
print(list(
    corpus))  # calling list() will convert any sequence to a plain Python list

print(corpus)
for doc in corpus:
    print(doc)

from gensim.models import LdaModel, LsiModel, HdpModel
Example No. 13
import jieba
import time

time_start = time.time()
# Data preprocessing
with open("toutiao_cat_data.txt", "r", encoding="utf-8") as f:
    # with open("test.txt","r",encoding="utf-8") as f:
    data = []
    for line in f.readlines():
        line = line.strip()  # strip whitespace
        line = ','.join(line.split("_!_")[3:])  # split on the "_!_" delimiter and drop the first three fields, which are not text content
        data.append(jieba.lcut(line))

# Text vectorization
dictionary = Dictionary(data)  # build a dictionary recording how often each word appears across the texts
dictionary.filter_n_most_frequent(200)  # filter out the 200 most frequent words
corpus = [dictionary.doc2bow(text) for text in data]  # convert each text to a bag-of-words vector

# Train the model
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)  # train LDA with 10 topics

# Get the topic-word distributions
topic_list = lda.print_topics(20)
# print(topic_list)
for i in topic_list:
    print(i)


def pre(data):
    'Get the topic distribution of a given document'
    print(data)
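The pre() helper above is cut off in this listing. As a hedged sketch (not the original author's code), one way to obtain the topic distribution for a new headline with the trained model is:

def topic_distribution(text, dictionary, lda):
    '''Tokenize with jieba, map to the filtered dictionary, and query the trained LdaModel.'''
    bow = dictionary.doc2bow(jieba.lcut(text))   # bag-of-words under the filtered dictionary
    return lda.get_document_topics(bow)          # list of (topic_id, probability) pairs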
Example No. 14
class FasttextTfIdfTransformer:
    def __init__(self,
                 model=None,
                 dictionary=None,
                 corpus_file=None,
                 size=256,
                 window=7,
                 min_count=4,
                 iter=30,
                 min_n=4,
                 max_n=5,
                 word_ngrams=1,
                 no_above=0.5,
                 filter_n_most_frequent=100,
                 do_filter_tokens=True,
                 workers=multiprocessing.cpu_count() - 1,
                 ft_prefix="ft_",
                 token_column=None,
                 inplace=True,
                 store_train_data=False,
                 skip_fit=False,
                 skip_transform=False,
                 normalize_word_vectors=True):
        self.size = size
        self.window = window
        self.min_count = min_count
        self.iter = iter
        self.min_n = min_n
        self.max_n = max_n
        self.word_ngrams = word_ngrams
        self.workers = workers
        self.token_column = token_column
        self.model = None
        assert type(self.token_column) == str
        self.ft_prefix = ft_prefix
        self.skip_fit = skip_fit
        self.skip_transform = skip_transform
        self.inplace = inplace
        self.normalize_word_vectors = normalize_word_vectors
        self.store_train_data = store_train_data
        self.train = None
        self.model = model
        self.no_above = no_above
        self.word_set = None
        self.filter_n_most_frequent = filter_n_most_frequent
        self.do_filter_tokens = do_filter_tokens
        self.dictionary = dictionary
        if model is None and corpus_file is not None:
            self.dictionary = Dictionary(
                map(lambda s: s.split(), load_list_per_line(corpus_file)))
            print("Total Unique Tokens = %s" % (len(self.dictionary)))
            self.dictionary.filter_extremes(no_below=self.min_count,
                                            no_above=self.no_above,
                                            keep_n=1000000)
            self.dictionary.filter_n_most_frequent(self.filter_n_most_frequent)
            print("Total Unique Tokens after filtering = %s" %
                  (len(self.dictionary)))
            self.word_set = set(self.dictionary.values())
            self.model = FastText(corpus_file=corpus_file,
                                  size=self.size,
                                  window=self.window,
                                  min_count=self.min_count,
                                  iter=self.iter,
                                  min_n=self.min_n,
                                  max_n=self.max_n,
                                  word_ngrams=self.word_ngrams,
                                  workers=self.workers,
                                  bucket=8000000,
                                  alpha=0.03,
                                  negative=10,
                                  ns_exponent=0.5)

        if (model is None or dictionary is None) and corpus_file is None:
            raise ValueError("No data given to initialise FastText Model")
        assert self.dictionary is not None and self.model is not None

    def fit(self, X, y='ignored'):
        gc.collect()
        if self.store_train_data:
            self.train = (X, y)
        if self.skip_fit:
            return self
        if type(X) == pd.DataFrame:
            X = X[self.token_column].values
        else:
            raise ValueError()

        assert self.dictionary is not None and self.model is not None

        self.dictionary.add_documents(X)
        dct = self.dictionary
        print("Total Unique Tokens = %s" % (len(dct)))
        dct.filter_extremes(no_below=self.min_count,
                            no_above=self.no_above,
                            keep_n=1000000)
        dct.filter_n_most_frequent(self.filter_n_most_frequent)
        print("Total Unique Tokens after filtering = %s" % (len(dct)))
        self.word_set = set(dct.values())

        print("FastText Modelling Started at %s" % (str(pd.datetime.now())))
        self.model.build_vocab(X, update=True)
        self.model.train(X,
                         total_examples=self.model.corpus_count,
                         epochs=self.model.epochs)
        print("FastText Modelling done at %s" % (str(pd.datetime.now())))
        print("FastText Vocab Length = %s, Ngrams length = %s" % (len(
            self.model.wv.vectors_ngrams), len(self.model.wv.vectors_vocab)))

        gc.collect()
        return self

    def fit_stored(self):
        X, y = self.train
        return self.fit(X, y)

    def partial_fit(self, X, y=None):
        self.fit(X, y='ignored')

    def transform_one(self, token_array):
        tokens2vec = [
            self.model.wv[token] if token in self.model.wv else np.full(
                self.size, 0) for token in token_array
        ]
        if np.sum(tokens2vec) == 0:
            return np.full(self.size, 0)
        return np.average(tokens2vec, axis=0)

    def transform(self, X, y='ignored'):
        print("Fasttext Transforms start at: %s" % (str(pd.datetime.now())))
        if self.skip_transform:
            return X
        if type(X) == pd.DataFrame:
            Input = X[self.token_column].values
        else:
            raise ValueError()
        if not self.inplace:
            X = X.copy()

        uniq_tokens = set(more_itertools.flatten(Input))
        print("Number of Unique Test Tokens for Fasttext transform %s" %
              len(uniq_tokens))
        if self.do_filter_tokens:
            uniq_tokens = uniq_tokens.intersection(self.word_set)
        print(
            "Number of Unique Test Tokens after filtering for Fasttext transform %s"
            % len(uniq_tokens))
        empty = np.full(self.size, 0)
        token2vec = {
            k: self.model.wv[k] if k in self.model.wv else empty
            for k in uniq_tokens
        }
        token2vec = {k: v / np.linalg.norm(v) for k, v in token2vec.items()}

        def tokens2vec(token_array):
            empty = np.full(self.size, 0)
            if len(token_array) == 0:
                return empty
            return [
                token2vec[token] if token in uniq_tokens else empty
                for token in token_array
            ]

        ft_vecs = list(map(tokens2vec, Input))

        results = list(
            map(
                lambda x: np.average(
                    x,
                    axis=0,
                ) if np.sum(x) != 0 else np.full(self.size, 0), ft_vecs))  # self.size (not a hard-coded 300) keeps the column count consistent

        text_df = pd.DataFrame(list(map(list, results)))
        text_df.columns = [
            self.ft_prefix + str(i) for i in range(0, self.size)
        ]
        text_df.index = X.index
        X[list(text_df.columns)] = text_df
        gc.collect()
        print("Fasttext Transforms done at: %s" % (str(pd.datetime.now())))
        return X

    def inverse_transform(self, X, copy=None):
        raise NotImplementedError()

    def fit_transform(self, X, y='ignored'):
        self.fit(X)
        return self.transform(X)