def tfidf_filter(dataset, threshold):
    # tokenize documents
    tokens = []
    for doc in dataset:
        tokens.append(regTokenize(doc))

    # build the dictionary and bag-of-words corpus
    dct = Dictionary(tokens)
    corpus = [dct.doc2bow(line) for line in tokens]

    # train a tf-idf model and collect ids of low-value words
    model = TfidfModel(corpus, id2word=dct)
    low_value_words = []
    for bow in corpus:
        low_value_words += [id for id, value in model[bow] if value < threshold]

    # drop the low-value words and re-encode the documents
    dct.filter_tokens(bad_ids=low_value_words)
    new_corpus = [dct.doc2bow(doc) for doc in tokens]

    corp = []
    for doc in new_corpus:
        corp.append([dct[id] for id, value in doc])
    return corp
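# A minimal, self-contained sketch of the same low-tfidf pruning on a toy
# corpus; whitespace tokenization stands in for regTokenize, which belongs to
# the snippet's own codebase, and the 0.3 threshold is purely illustrative.
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

docs = ["the cat sat on the mat", "the dog sat on the log", "cats and dogs bark"]
tokens = [doc.split() for doc in docs]

dct = Dictionary(tokens)
corpus = [dct.doc2bow(toks) for toks in tokens]
model = TfidfModel(corpus, id2word=dct)

low_value_ids = {wid for bow in corpus for wid, value in model[bow] if value < 0.3}
dct.filter_tokens(bad_ids=low_value_ids)

# re-encode with the pruned dictionary
print([[dct[wid] for wid, _ in dct.doc2bow(toks)] for toks in tokens])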
def remove_rare_often_word(texts, low_value, high_value):
    # remove overly frequent and overly rare words based on tf-idf scores
    texts_tokenized = [simple_preprocess(doc) for doc in texts]
    dictionary = Dictionary(texts_tokenized)
    corpus = [dictionary.doc2bow(doc) for doc in texts_tokenized]
    tfidf = TfidfModel(corpus, id2word=dictionary)
    corpus_tfidf = tfidf[corpus]

    bad_words = []
    for sent_tfidf in tqdm(corpus_tfidf, desc="selecting bad words"):
        bad_words += [id for id, value in sent_tfidf
                      if (value < low_value) or (value > high_value)]
    dictionary.filter_tokens(bad_ids=bad_words)

    out_bow = [dictionary.doc2bow(doc) for doc in texts_tokenized]
    out_corpus = []
    for doc in tqdm(out_bow, desc='Creating out corpus'):
        out_corpus.append([dictionary.get(id) for id, value in doc])

    dict_tfidf = {
        dictionary.get(id): value
        for doc in corpus_tfidf
        for id, value in doc
        if (value >= low_value) and (value <= high_value)
    }
    return {'texts': out_corpus, 'dict_tfidf': dict_tfidf, 'dictionary': dictionary}
class MiCorpus:
    """
    Iterable: each iteration yields bag-of-words vectors, one per document.
    Processes one document at a time using generators; never loads the whole
    corpus into RAM.
    """

    def __init__(self, directorio, lenguaje, otros=None):
        self.directorio = directorio
        self.lenguaje = lenguaje
        self.otros = otros
        self.ngramas = model_ngrams(
            iter_sentences(self.directorio, self.lenguaje, self.otros))
        self.diccionario = Dictionary(
            iter_documents(self.ngramas, self.directorio, self.lenguaje,
                           self.otros))
        self.diccionario.filter_extremes(no_above=0.8)
        self.diccionario.filter_tokens(
            bad_ids=(tokid for tokid, freq in self.diccionario.dfs.items()
                     if freq == 1))
        self.diccionario.compactify()

    def __iter__(self):
        """CorpusConsultivos is a streamed iterable."""
        for tokens in iter_documents(self.ngramas, self.directorio,
                                     self.lenguaje, self.otros):
            yield self.diccionario.doc2bow(tokens)
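# A self-contained sketch of the same streaming pattern with stand-in
# iterables, since iter_documents/model_ngrams belong to this snippet's own
# codebase; filter_extremes defaults are relaxed to suit the toy corpus.
from gensim.corpora import Dictionary

class StreamedCorpus:
    """Yields one bag-of-words vector per document; never holds all BoWs in RAM."""

    def __init__(self, stream_factory):
        self.stream_factory = stream_factory  # callable returning a fresh token iterator
        self.dictionary = Dictionary(self.stream_factory())
        self.dictionary.filter_extremes(no_below=1, no_above=0.8)
        once_ids = [tokid for tokid, freq in self.dictionary.dfs.items() if freq == 1]
        self.dictionary.filter_tokens(bad_ids=once_ids)
        self.dictionary.compactify()

    def __iter__(self):
        for tokens in self.stream_factory():
            yield self.dictionary.doc2bow(tokens)

docs = [["a", "b", "a"], ["b", "c"], ["c", "d", "b"]]
for bow in StreamedCorpus(lambda: iter(docs)):
    print(bow)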
def texts2corpus(documents, tfidf=False, stopwords=None, filter_below=5,
                 filter_above=0.5, keep_n=100000, logg=print):
    logg(f'generating {"tfidf" if tfidf else "bow"} corpus and dictionary')

    dictionary = Dictionary(documents, prune_at=None)
    dictionary.filter_extremes(no_below=filter_below, no_above=filter_above,
                               keep_n=keep_n)

    # filter out some noise (e.g. special characters); guard against stopwords
    # already removed by filter_extremes
    if stopwords:
        stopword_ids = [dictionary.token2id[token] for token in stopwords
                        if token in dictionary.token2id]
        dictionary.filter_tokens(bad_ids=stopword_ids, good_ids=None)

    bow_corpus = [dictionary.doc2bow(text) for text in documents]
    if tfidf:
        tfidf_model = TfidfModel(bow_corpus)
        corpus = tfidf_model[bow_corpus]
    else:
        corpus = bow_corpus

    return corpus, dictionary
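# Note on the tfidf branch above: tfidf_model[bow_corpus] returns a lazy
# TransformedCorpus wrapper rather than a list, so rows are recomputed on each
# pass; materialize with list(...) if random access is needed. A tiny
# self-contained check (class name as in recent gensim versions):
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

texts = [["a", "b"], ["b", "c"]]
dct = Dictionary(texts)
bows = [dct.doc2bow(t) for t in texts]
tfidf_corpus = TfidfModel(bows)[bows]
print(type(tfidf_corpus).__name__, list(tfidf_corpus))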
def testFilterTokens(self):
    self.maxDiff = 10000
    d = Dictionary(self.texts)

    removed_word = d[0]
    d.filter_tokens([0])

    expected = {'computer': 0, 'eps': 8, 'graph': 10, 'human': 1,
                'interface': 2, 'minors': 11, 'response': 3, 'survey': 4,
                'system': 5, 'time': 6, 'trees': 9, 'user': 7}
    del expected[removed_word]
    self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))

    expected[removed_word] = len(expected)
    d.add_documents([[removed_word]])
    self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))
def _build_vocab(self, max_vocab_cnt):
    all_words = []
    for data in self.valid + self.non_valid:
        all_words.append(data["title"] + data["content"])
    vocab = Dictionary(all_words)
    raw_vocab_size = len(vocab)

    vocab.filter_extremes(no_below=5)
    vocab.filter_extremes(keep_n=max_vocab_cnt)
    # drop single-character ascii tokens, except "a" and "i"
    len_1_words = list(filter(
        lambda w: len(w) == 1 and re.match(r"[\x00-\x7f]", w)
        and w not in ["a", "i"],
        vocab.values()))
    vocab.filter_tokens(list(map(vocab.token2id.get, len_1_words)))
    if self.config.use_dict == "seq" and self.config.enable_pad:
        vocab.token2id[PAD] = len(vocab)
    vocab.compactify()
    self.pad_wid = vocab.token2id.get(PAD)
    self.vocab_seq = vocab  # seq dictionary

    # build bow dictionary
    self.vocab_bow = copy.deepcopy(vocab)
    self.vocab_bow.filter_tokens(
        map(self.vocab_bow.token2id.get, STOPWORDS))  # filter stop words
    self.vocab_bow.compactify()
    if self.config.tfidf:
        tfidf_corpus = [self.vocab_bow.doc2bow(line) for line in all_words]
        self.tfidf_model = TfidfModel(tfidf_corpus)
    print("Load corpus with non_valid size %d, valid size %d, "
          "raw vocab size %d seq vocab size %d, bow vocab size %d"
          % (len(self.non_valid), len(self.valid), raw_vocab_size,
             len(self.vocab_seq), len(self.vocab_bow)))
def prepare_corpus(tweets_file, corpus_file, dictionary_file, author_topic):
    stop_words = set(stopwords.words('english'))
    stop_words.add(u'rt')

    print('Loading tweets from ' + tweets_file)
    tweets = pd.read_pickle(tweets_file)
    if author_topic:
        tweets = tweets.groupby('user').agg({'text': 'sum'})
    print('%d tweets loaded' % len(tweets.index))

    dictionary = Dictionary(tweets['text'])
    stopword_ids = map(dictionary.token2id.get, stop_words)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=None)
    dictionary.compactify()

    corpus = [dictionary.doc2bow(doc) for doc in tweets['text']]

    print("Writing corpus to " + corpus_file)
    MmCorpus.serialize(corpus_file, corpus)
    print("Writing dictionary to " + dictionary_file)
    dictionary.save(dictionary_file)
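# Round-trip sketch: loading the serialized artifacts back. The file names
# are hypothetical stand-ins for whatever corpus_file/dictionary_file were
# passed to prepare_corpus above.
from gensim.corpora import Dictionary, MmCorpus

corpus = MmCorpus("tweets_corpus.mm")        # hypothetical path
dictionary = Dictionary.load("tweets.dict")  # hypothetical path
print(len(corpus), "documents,", len(dictionary), "unique tokens")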
class process_corpus(object):
    def __init__(self, sql=None, lemmatize=False, first_sentences=False,
                 n_sentences=10):
        self.sql = sql
        self.first_sentences = first_sentences
        self.n_sentences = n_sentences
        self.wordnet = WordNetLemmatizer()
        self.pstemmer = PorterStemmer()
        self.lemmatize = lemmatize
        self.dictionary = Dictionary(self.iterrecords())
        print('dictionary before:', self.dictionary.token2id)
        # drop tokens that appear in only one document
        once_ids = [tokenid for tokenid, docfreq in self.dictionary.dfs.items()
                    if docfreq == 1]
        self.dictionary.filter_tokens(once_ids)
        self.dictionary.compactify()
        print('dictionary after filtering:', self.dictionary.token2id)

    def __iter__(self):
        # generate the document tokens and create bows using the dictionary
        self.cl = 0
        for tokens in self.iterrecords():
            self.cl += 1
            yield self.dictionary.doc2bow(tokens)

    def iterrecords(self):
        # generate document tokens for the dictionary
        self.index = []
        cursor.execute(self.sql)
        ct = 0
        for doc in cursor:
            print(ct)
            self.index.append(str(doc[0]).strip())
            doc = doc[1]
            if self.first_sentences:
                doc = get_first_n_sentences_from_document(doc, self.n_sentences)
            tokens = clean_text_by_word(doc)
            ct += 1
            yield tokens  # or whatever tokenization suits you

    def __len__(self):
        return self.cl
def fetch_dict():
    global dictionary
    dictionary = Dictionary([i for i in my_dictionary])
    # drop tokens that appear in only one document
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items()
                if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()
    dictionary.save("Topic/dic.loc")
    return dictionary
def pipline(data: DataFrame):
    if os.path.isfile(cache_path('run/' + _ARGS.name)):
        corpus, dictionary, documents = load_cache('run/' + _ARGS.name)
    elif data is not None:  # plain `elif data:` is ambiguous for a DataFrame
        documents = data['tokens'].to_list()
        # Create a dictionary representation of the documents.
        dictionary = Dictionary(documents)
        # Filter out words that occur in fewer than 20 documents,
        # or in more than 50% of the documents.
        dictionary.filter_extremes(no_below=20, no_above=0.5)
        # Remove stopwords.
        bad_ids = [dictionary.token2id[t] for t in STOP_WORDS
                   if t in dictionary.token2id]
        dictionary.filter_tokens(bad_ids=bad_ids)
        # Bag-of-words representation of the documents.
        corpus = [dictionary.doc2bow(doc) for doc in documents]
        dump_cache((corpus, dictionary, documents), 'run/' + _ARGS.name)
    else:
        raise ValueError('Cache does not exist and no data was passed in')
    _ = dictionary[0]  # This is only to "load" the dictionary.
    output('Number of unique tokens: ', len(dictionary))
    output('Number of documents: ', len(corpus))

    topic_range = tuple(int(s.strip()) for s in _ARGS.range.split(','))
    kwargs = dict(
        id2word=dictionary.id2token,
        chunksize=len(corpus),
        passes=_ARGS.passes,
        alpha='auto',
        eta='auto',
        eval_every=1,
        iterations=_ARGS.iterations,
        random_state=123)
    if len(corpus) < 1e6:
        # Train the models in parallel.
        pool = Pool(_ARGS.pool_size)
        result_dict = dict()
        for k in range(*topic_range):
            result_dict[k] = pool.apply_async(get_model, (corpus, k, kwargs))
        result_dict = {k: v.get() for k, v in result_dict.items()}
        pool.close()  # close the pool once the child processes have finished
        pool.join()
        output(f"Searched range{topic_range}")
        # The coherence computation spawns its own processes, so run it serially.
        for k, (model, ids) in result_dict.items():
            eval_and_write(data, k, documents, dictionary, corpus, model, ids)
    else:
        kwargs['chunksize'] = len(corpus) // 8 // _ARGS.pool_size + 1
        for k in range(*topic_range, 2):  # search coarsely on large datasets
            model = LdaModel(corpus, k, **kwargs)
            ids = save_and_inference(model, corpus, k, kwargs['chunksize'])
            # keeping every model in memory is infeasible with ~4M sentences
            eval_and_write(None, k, documents, dictionary, corpus, model, ids)
            del model, ids
            gc.collect()
    output(f"===> {_ARGS.name} complete. \n")
def prep_corpus(docs, additional_stopwords=set(), no_below=2, no_above=0.05):
    dictionary = Dictionary(docs)
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    return dictionary, corpus
def Gensim_Dic(sentences, tmp_fname):
    dct = Dictionary(sentences)
    dct.filter_extremes(no_below=10)
    # collect stopword ids only after filter_extremes, which compactifies the
    # dictionary and reassigns token ids
    bad_ids = []
    for w in stopwords:
        if w in dct.token2id:
            bad_ids.append(dct.token2id[w])
    dct.filter_tokens(bad_ids=bad_ids)
    dct.compactify()
    dct.save_as_text(tmp_fname)
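# Why stopword ids must be collected *after* filter_extremes: the call
# compactifies the dictionary and may reassign token ids, so ids gathered
# earlier can point at different tokens. A small self-contained check:
from gensim.corpora import Dictionary

texts = [["apple", "banana"], ["banana", "cherry"], ["banana", "durian"]]
dct = Dictionary(texts)
id_before = dct.token2id["banana"]             # 1: "apple" was assigned id 0 first
dct.filter_extremes(no_below=2, no_above=1.0)  # drops all the df==1 tokens
print(id_before, dct.token2id["banana"])       # 1 0 -> the id changed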
def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
    print('Building dictionary...')
    dictionary = Dictionary(docs)
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    return dictionary, corpus
def __prep_dict(self, doc):
    def nltk_stopwords():
        return set(nltk.corpus.stopwords.words('english'))

    additional_stopwords = ['nbsp', '.', ',', '"', "'", '?', '!', '>', ':',
                            ';', '(', ')', '[', ']', '{', '}', '/', '.com']
    dictionary = Dictionary(doc)
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    return dictionary.doc2bow(doc)
def fetch_dict():
    print("Fetching Dictionary...", end=" ")
    try:
        dictionary = Dictionary().load("Topic/dic.tm")
        print("Dictionary loaded!")
    except IOError:
        print("Dictionary not found, building Dictionary...")
        dictionary = Dictionary(i for i in MyDictionary())
        # drop tokens that appear in only one document
        once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items()
                    if docfreq == 1]
        dictionary.filter_tokens(once_ids)
        dictionary.compactify()
        print("\rDictionary Built!")
        print(dictionary)
        dictionary.save("Topic/dic.tm")
    return dictionary
def extract_topics(words):
    word_id_map = Dictionary([words])
    word_id_map.filter_tokens([id for id, occurrence in word_id_map.dfs.items()
                               if occurrence == 2])
    word_id_map.compactify()
    deals_corpus = [word_id_map.doc2bow(words)]
    lda = LdaModel(corpus=deals_corpus, id2word=word_id_map, num_topics=15,
                   update_every=1, chunksize=1000, passes=1)
    topics = []
    for i in range(15):
        tokens = lda.print_topic(i).split('+')
        topic_scores = []
        for token in tokens:
            score, token_val = token.split('*')
            topic_scores.append((token_val, score))
        topics.append(topic_scores)
    return topics
def execute(self, data, passes=10):
    wordlists = [corpus.contents.lower().split() for corpus in data]
    stoplist = stopwords.words('english')
    dictionary = Dictionary(wordlists)

    # Remove stop words and words that appear too much or too little
    stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
                if stopword in dictionary.token2id]
    dictionary.filter_tokens(stop_ids)
    dictionary.filter_extremes(no_below=2, no_above=0.2)

    bags_of_words = [dictionary.doc2bow(t) for t in wordlists]

    # This can take a while to run:
    lda = LdaModel(bags_of_words, id2word=dictionary,
                   num_topics=self.num_topics, passes=passes)
    results = self.assemble_topics(lda)

    return results
def get_topics_lda(tokens, n_topics=10):
    """
    Using the `gensim` package for LDA. LDA is a little better than LSA as it
    provides a reasonable mixture of topics (Wikipedia). `gensim` is a package
    for topic modeling only, so for a particular topic modeling task it is a
    lighter option to install and run. It can also be run distributed and
    updated over an existing model.

    :param tokens: Preprocessed tokens for faster dictionary building
    :param n_topics: Number of topics to decompose data to
    :return: list() of topics
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print("Dictionary file does not exist. Creating one")
        dictionary = Dictionary(tokens)
        # drop tokens that appear in only one document
        freq1 = [id for id, freq in dictionary.dfs.items() if freq == 1]
        dictionary.filter_tokens(freq1)
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print("Corpus file does not exist. Creating one")
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics,
                   update_every=1, chunksize=1000, passes=1)

    topics = []
    for i in range(0, n_topics):
        words = lda.print_topic(i).split('+')
        topic = []
        for word in words:
            score, w = word.split('*')
            topic.append((w, score))
        topics.append(topic)
    return topics
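# The string parsing of print_topic() above is brittle; show_topic() already
# returns (word, probability) pairs. A self-contained sketch on a toy corpus:
from gensim.corpora import Dictionary
from gensim.models import LdaModel

toy_docs = [["cat", "dog", "pet"], ["stock", "market", "trade"], ["dog", "pet", "vet"]]
toy_dct = Dictionary(toy_docs)
toy_corpus = [toy_dct.doc2bow(d) for d in toy_docs]
toy_lda = LdaModel(corpus=toy_corpus, id2word=toy_dct, num_topics=2, passes=5)

topics = [toy_lda.show_topic(i) for i in range(2)]  # [(word, prob), ...] per topic
print(topics)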
def prepare_word_embedding():
    """Construct vocabulary file and word embedding file."""
    df = pd.read_csv("data/raw/train.csv",
                     usecols=["original_phrase1", "original_phrase2", "ytrue"])
    model = KeyedVectors.load_word2vec_format(
        "/data/mayu-ot/Data/Model/GoogleNews-vectors-negative300.bin.gz",
        binary=True)

    CUSTOM_FILTERS = [
        lambda x: x.lower(),
        strip_punctuation,
        strip_multiple_whitespaces,
        strip_numeric,
    ]
    doc = [preprocess_string(x, CUSTOM_FILTERS) for x in df.values[:, :2].ravel()]

    dct = Dictionary(doc)
    # drop tokens that have no pretrained word2vec vector
    bad_ids = [k for k, v in dct.items() if v not in model]
    dct.filter_tokens(bad_ids)
    dct.compactify()

    # spot-check the first few entries
    for k, v in dct.items():
        print(k, v)
        if k == 10:
            break

    dct.save_as_text("data/processed/dictionary.txt")

    word_emb = np.ones((len(dct), 300))
    for k, v in dct.items():
        word_emb[k] = model[v]
    np.save("data/processed/word2vec", word_emb)
def train_lda(self, df, n_topics, min_count=2, labels=None, tag=False):
    """
    Learn an LDA topic model from input data using gensim.

    :param df:
    :param n_topics:
    :param min_count:
    :return:
    """
    # Save class labels if necessary
    if labels is not None:
        y = df.loc[:, labels].values

    # Clean and find phrases
    df = read_clean(df, phraser=self.phrases)

    # Get gensim dictionary, remove function words and infrequent words
    common_dictionary = Dictionary(df)
    common_dictionary.filter_extremes(no_below=min_count)
    remove_ids = [common_dictionary.token2id[x]
                  for x in self.function_words_single
                  if x in common_dictionary.token2id]

    # Filter out words we don't want
    common_dictionary.filter_tokens(bad_ids=remove_ids)
    common_corpus = [common_dictionary.doc2bow(text) for text in df]

    # Train LDA
    lda = LdaModel(common_corpus,
                   num_topics=n_topics,
                   distributed=False,
                   passes=10,
                   iterations=10,
                   )

    # Save to class
    self.lda = lda
    self.lda_dictionary = common_dictionary
    ai_logger.debug("Done learning LDA model")

    # If necessary, annotate the corpus as well
    if tag:
        tag_df = self.use_lda(df, y, cleaned=True)
        return tag_df
def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
    print('Building dictionary...')
    dictionary = Dictionary(docs)

    # remove stopwords
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()

    # remove short words, len(word) <= 3
    shortword_ids = [tokenid for tokenid, word in dictionary.items()
                     if len(word.split('/')[0]) <= 3]
    dictionary.filter_tokens(shortword_ids)
    dictionary.compactify()

    # remove words that appear only once
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items()
                if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()

    # filter extreme values
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    return dictionary, corpus
def build_dictionary(text_file, stop_words):
    """
    Takes a text file and a file of stop words and builds a dictionary with
    pairs of word indexes and word counts for every paragraph.

    :param text_file: Input text file
    :param stop_words: Text file of stop words
    :return: Corpus object (= list of paragraphs); each paragraph is a list
        of pairs (word-index, word-count)
    """
    words, paragraphs = process_text(text_file)
    dictionary = Dictionary(words)

    # Gather all stop words
    with codecs.open(stop_words, "r", "utf-8") as stop_w:
        stop_words = stop_w.read().split(',')

    # Gather all stop word ids
    stop_word_ids = []
    for i in range(len(dictionary)):
        if dictionary[i] in stop_words:  # Check if stop word exists in dictionary
            stop_word_ids.append(dictionary.token2id[dictionary[i]])
    dictionary.filter_tokens(stop_word_ids)  # Filter out all stop words

    bags_of_words = []
    printProgressBar(0, len(words), prefix='Building dictionary:',
                     suffix='Complete', length=50)
    for i in range(len(words)):
        printProgressBar(i + 1, len(words), prefix='Building dictionary:',
                         suffix='Complete', length=50)
        bags_of_words.append(dictionary.doc2bow(words[i]))

    return bags_of_words, dictionary, paragraphs
def prep_text_lda(docs, vocab_size=20000):
    """ docs: (pd.Series str) cleaned text """
    english_stopwords = set([s.replace("\'", "") for s in stopwords.words("english")])
    tqdm.pandas(desc="Tokenizing")
    tokenized_docs = docs.progress_apply(lambda x: [w.lower() for w in tokenize(x)])

    bigram = Phrases(tokenized_docs.values.tolist())
    phraser = Phraser(bigram)
    tqdm.pandas(desc="Bigrams")
    bigrammed_docs = tokenized_docs.progress_apply(lambda tokens_: phraser[tokens_])

    id2word = Dictionary(bigrammed_docs.values.tolist())
    id2word.filter_extremes(keep_n=vocab_size, no_above=0.5)
    id2word.filter_tokens(bad_ids=[id2word.token2id[a] for a in english_stopwords
                                   if a in id2word.token2id])
    id2word.compactify()

    tqdm.pandas(desc="Cleaning")
    tokenized = bigrammed_docs.progress_apply(
        lambda doc_tokens: " ".join([w for w in doc_tokens if w in id2word.token2id]))
    reconst_docs = tokenized.apply(lambda x: x.split())
    return id2word, reconst_docs
def __init__(self, directory=None, dictionary=None, distributions=None,
             corpus=None, max_docs=None):
    if directory:
        docs = self.get_docs(directory, distributions, max_docs)
        if not dictionary:
            # Construct the dictionary without holding all texts in memory,
            # based on the example in the Gensim docs.
            dictionary = Dictionary(
                filter_common(codecs.open(doc, encoding='utf-8').read().lower().split())
                for doc in docs)
            once_words = [id for id, freq in dictionary.dfs.items() if freq == 1]
            dictionary.filter_tokens(once_words)  # exclude words that appear once
            dictionary.compactify()  # remove gaps in ids left by removing words
            # keep words in at least 20 docs and at most 75% of all docs
            dictionary.filter_extremes(no_below=20, no_above=0.75, keep_n=None)
            self.dictionary = dictionary
        else:
            self.dictionary = Dictionary.load(dictionary)
        self.docs = PaperCorpus(docs)
    elif dictionary and corpus:
        self.dictionary = Dictionary.load(dictionary)
        self.docs = MmCorpus(corpus)
    else:
        self.dictionary = Dictionary([])
        self.docs = PaperCorpus([])
    self.transformation = IdentityTransformation()
    self.train_time = None
    self.sim_index = None
mecab = MeCab.Tagger("-Owakati")

# words to exclude from the dictionary
words_blacklist = [
    ">>",  # chat annotation
    "some_agent",
    "\u3000",  # full-width space
    "。",
    "、",
]

dct = Dictionary()

# read the csv file
df = pd.read_csv(filepath, delimiter=",", names=["talker", "words", "type"])

# tokenize each sentence with MeCab, split on spaces, and drop the trailing
# newline token
wakati_df = df["words"].map(lambda x: mecab.parse(x).split(" ")[:-1])

# add the documents to the dictionary
dct.add_documents(wakati_df)

# look up the dictionary ids of the blacklisted words
words_blacklist_id = dct.doc2idx(words_blacklist)

# remove them from the dictionary
dct.filter_tokens(bad_ids=words_blacklist_id)
# dct.filter_n_most_frequent(600)

# save the dictionary
dct.save(os.path.join(filedir, ".".join([filename, "dict"])))

# show the dictionary contents and vocabulary size
print(dct.token2id)
print(len(dct.token2id))
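# Note on doc2idx above: words missing from the dictionary map to -1 (the
# default unknown_word_index); -1 matches no real token id, so passing it
# straight to filter_tokens is harmless. A small self-contained check:
from gensim.corpora import Dictionary

dct = Dictionary([["hello", "world"]])
ids = dct.doc2idx(["hello", "not_in_dict"])
print(ids)                      # [0, -1]
dct.filter_tokens(bad_ids=ids)  # removes "hello"; the -1 is ignored
print(dct.token2id)             # only "world" remains (compactify() closes the id gap)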
class CMVCorpus(object):
    logger = logging.getLogger(__name__)

    def __init__(self, config):
        self.config = config
        self._path = config.data_dir[0]
        self.max_data_size = config.max_data_size
        self.max_utt_len = config.max_utt_len
        self.tokenize = get_chat_tokenize()
        self.train_corpus, self.test_corpus = self._read_file(
            os.path.join(self._path))
        self._build_vocab(config.max_vocab_cnt)
        print("Done loading corpus")

    def _process_dialog(self, data):
        new_dialog = []
        all_lens = []
        all_dialog_lens = []
        for raw_dialog in data:
            dialog = {
                "title": self.tokenize(raw_dialog['title'].lower()),
                "op": self.tokenize(raw_dialog["content"].lower()),
                "pos_conv_lst": [],
                "neg_conv_lst": []
            }
            for i, turns in enumerate(raw_dialog['comments']):  # for each comment lst
                if turns["win"]:
                    conv_lst = dialog["pos_conv_lst"]
                else:
                    conv_lst = dialog["neg_conv_lst"]
                new_utt_lst = []
                for turn in turns["utt_lst"]:
                    argument = self.tokenize(turn.lower())
                    all_lens.append(len(argument))
                    new_utt_lst.append(argument)
                conv_lst.append(new_utt_lst)
                all_dialog_lens.append(len(new_utt_lst))
            new_dialog.append(dialog)
            # cut off at the max data size
            if len(new_dialog) >= self.max_data_size:
                break
        print("Max utt len %d, mean utt len %.2f" %
              (np.max(all_lens), float(np.mean(all_lens))))
        print("Max dialog len %d, mean dialog len %.2f" %
              (np.max(all_dialog_lens), float(np.mean(all_dialog_lens))))
        return new_dialog

    def _build_vocab(self, max_vocab_cnt):
        all_words = []
        for dialog in self.train_corpus:
            all_words.append(dialog["op"] + dialog["title"])
            for turns in dialog["pos_conv_lst"] + dialog["neg_conv_lst"]:
                for turn in turns:
                    all_words.append(turn)
        self.vocab_bow = Dictionary(all_words)
        raw_vocab_size = len(self.vocab_bow)
        raw_wc = np.sum(list(self.vocab_bow.dfs.values()))

        # remove useless words: very rare words, html stopwords, 'cmv' markers
        self.vocab_bow.filter_extremes(no_below=10, keep_n=max_vocab_cnt)
        bad_ids = HTML_STOPWORDS + ['cmv']
        self.vocab_bow.filter_tokens(
            list(map(self.vocab_bow.token2id.get, bad_ids)))
        self.vocab_bow.compactify()

        self.vocab_seq = copy.deepcopy(self.vocab_bow)  # for sequence model
        self.vocab_seq.token2id[self.vocab_seq[0]] = len(self.vocab_seq)
        self.vocab_seq.token2id[PAD] = 0
        self.vocab_seq.token2id[UNK] = len(self.vocab_seq)
        self.vocab_seq.compactify()
        self.pad_wid = self.vocab_seq.token2id.get(PAD)

        # drop single-character ascii tokens except meaningful punctuation
        len_1_words = list(filter(
            lambda w: len(w) == 1 and re.match(r"[\x00-\x7f]", w)
            and w not in ["[", "]", "$", "?", "!", "\"", "'", "i", "a"],
            self.vocab_bow.values()))
        self.vocab_bow.filter_tokens(
            list(map(self.vocab_bow.token2id.get, len_1_words)))

        # here we keep stopwords and some meaningful punctuation
        non_stopwords = filter(
            lambda w: re.match(r"^[\w\d_-]*$", w) and w not in STOPWORDS,
            self.vocab_bow.values())
        self.vocab_bow_stopwords = copy.deepcopy(self.vocab_bow)
        self.vocab_bow_stopwords.filter_tokens(
            map(self.vocab_bow_stopwords.token2id.get, non_stopwords))
        self.vocab_bow_stopwords.compactify()

        self.vocab_bow_non_stopwords = copy.deepcopy(self.vocab_bow)
        self.vocab_bow_non_stopwords.filter_tokens(
            map(self.vocab_bow_non_stopwords.token2id.get,
                self.vocab_bow_stopwords.values()))
        self.vocab_bow_non_stopwords.compactify()

        remain_wc = np.sum(list(self.vocab_bow.dfs.values()))
        min_count = np.min(list(self.vocab_bow.dfs.values()))
        print("Load corpus with train size %d, "
              "test size %d raw vocab size %d vocab size %d at cut_off %d OOV rate %f"
              % (len(self.train_corpus), len(self.test_corpus), raw_vocab_size,
                 len(self.vocab_bow), min_count, 1 - float(remain_wc) / raw_wc))

    def _read_file(self, path):
        with open(path, 'r') as f:
            data = json.load(f)
        return (self._process_dialog(data["train"]),
                self._process_dialog(data["test"]))

    def _sent2id_seq(self, sent, vocab):
        return list(filter(lambda x: x is not None,
                           [vocab.token2id.get(t) for t in sent]))

    def _sent2id_bow(self, sent, vocab):
        if sent:
            return vocab.doc2bow(sent)
        else:
            return []

    def _to_id_corpus(self, data, vocab_seq, vocab_bow):
        results = []
        word_cnt = 0
        msg_cnt = 0
        for dialog in data:
            # convert utterances and features into numeric ids
            id_dialog = Pack(title=self._sent2id_seq(dialog["title"], vocab_seq),
                             op=self._sent2id_seq(dialog["op"], vocab_seq),
                             pos_conv_seq_lst=[], pos_conv_bow_lst=[],
                             neg_conv_seq_lst=[], neg_conv_bow_lst=[])
            for turns in dialog["pos_conv_lst"]:
                new_turns_bow = []
                new_turns_seq = []
                for turn in turns:
                    id_turn_seq = self._sent2id_seq(turn, vocab_seq)
                    id_turn_bow = self._sent2id_bow(turn, vocab_bow)
                    if id_turn_seq and id_turn_bow:  # filter empty utt
                        new_turns_bow.append(id_turn_bow)
                        new_turns_seq.append(id_turn_seq)
                        word_cnt += len(id_turn_seq)
                        msg_cnt += 1
                if new_turns_seq and new_turns_bow:
                    id_dialog["pos_conv_bow_lst"].append(new_turns_bow)
                    id_dialog["pos_conv_seq_lst"].append(new_turns_seq)
            for turns in dialog["neg_conv_lst"]:
                new_turns_bow = []
                new_turns_seq = []
                for turn in turns:
                    id_turn_seq = self._sent2id_seq(turn, vocab_seq)
                    id_turn_bow = self._sent2id_bow(turn, vocab_bow)
                    if id_turn_seq and id_turn_bow:  # filter empty utt
                        new_turns_bow.append(id_turn_bow)
                        new_turns_seq.append(id_turn_seq)
                        word_cnt += len(id_turn_seq)
                        msg_cnt += 1
                if new_turns_seq and new_turns_bow:
                    id_dialog["neg_conv_bow_lst"].append(new_turns_bow)
                    id_dialog["neg_conv_seq_lst"].append(new_turns_seq)
            if id_dialog.pos_conv_bow_lst and id_dialog.neg_conv_bow_lst:
                results.append(id_dialog)
        print("Load seq with %d msgs, %d words" % (msg_cnt, word_cnt))
        return results, msg_cnt, word_cnt

    def _to_id_corpus_bow(self, data, vocab):
        results = []
        word_cnt = 0
        msg_cnt = 0
        for dialog in data:
            # convert utterances and features into numeric ids
            id_dialog = Pack(title=self._sent2id_bow(dialog["title"], vocab),
                             op=self._sent2id_bow(dialog["op"], vocab),
                             pos_conv_bow_lst=[], neg_conv_bow_lst=[])
            for turns in dialog["pos_conv_lst"]:
                new_turns = []
                for turn in turns:
                    id_turn = self._sent2id_bow(turn, vocab)
                    if id_turn:  # filter empty utt
                        new_turns.append(id_turn)
                        word_cnt += np.sum([j for i, j in id_turn])
                        msg_cnt += 1
                if new_turns:
                    id_dialog["pos_conv_bow_lst"].append(new_turns)
            for turns in dialog["neg_conv_lst"]:
                new_turns = []
                for turn in turns:
                    id_turn = self._sent2id_bow(turn, vocab)
                    if id_turn:  # filter empty utt
                        new_turns.append(id_turn)
                        word_cnt += np.sum([j for i, j in id_turn])
                        msg_cnt += 1
                if new_turns:
                    id_dialog["neg_conv_bow_lst"].append(new_turns)
            if id_dialog.pos_conv_bow_lst and id_dialog.neg_conv_bow_lst:
                results.append(id_dialog)
        print("Load bow with %d msgs, %d words" % (msg_cnt, word_cnt))
        return results, msg_cnt, word_cnt

    def get_corpus_bow(self, keep_stopwords=True):
        if keep_stopwords:
            vocab = self.vocab_bow
        else:
            vocab = self.vocab_bow_non_stopwords
        id_train = self._to_id_corpus_bow(self.train_corpus, vocab)
        id_test = self._to_id_corpus_bow(self.test_corpus, vocab)
        return Pack(train=id_train, test=id_test, vocab_size=len(vocab))

    def get_corpus_seq(self):
        vocab = self.vocab_seq
        id_train = self._to_id_corpus_seq(self.train_corpus, vocab)
        id_test = self._to_id_corpus_seq(self.test_corpus, vocab)
        return Pack(train=id_train, test=id_test, vocab_size=len(vocab))

    def get_corpus(self):
        id_train = self._to_id_corpus(self.train_corpus, self.vocab_seq,
                                      self.vocab_bow)
        id_test = self._to_id_corpus(self.test_corpus, self.vocab_seq,
                                     self.vocab_bow)
        return Pack(train=id_train, test=id_test,
                    vocab_size=len(self.vocab_bow))
stop = set(stopwords.words('english'))
stop_words = [
    'name', 'traceback', 'time', 'require', 'create', 'yamanashi', 'int',
    'byte', 'lyt', 'still', 'thu', 'total', 'cisco', 'type', 'actual',
    'node', 'show', 'needed', 'init', 'clear', 'set', 'ok', 'please',
    'jan', 'feb', 'mar', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    'mon', 'tue', 'wed', 'thur', 'fri', 'sat', 'sun', 'utc', 'ist',
    'changed', 'info', 'saved', 'successfully', 'need', 'collecting',
    'second', 'minute', 'hour', 'timer', 'timed', 'manager', 'director',
    'major', 'fujitsu', 'us', 'india', 'united states', 'japan', 'china'
]
stop_words = list(set(list(stop) + stop_words))

# Remove all stopwords that actually occur in the dictionary in one call.
bad_ids = [dictionary.token2id[word] for word in stop_words
           if word in dictionary.token2id]
dictionary.filter_tokens(bad_ids=bad_ids)

# Vectorize data: bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

# Set training parameters.
num_topics = 8
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity; it takes too much time.

# Make an index-to-word dictionary.
for indexOfWord in range(len(listOfParagraphs[paragraph])):
    listOfParagraphs[paragraph][indexOfWord] = stemmer.stem(
        listOfParagraphs[paragraph][indexOfWord])

# Build the dictionary and remove stopwords
dictionary = Dictionary(listOfParagraphs)
f = codecs.open("common-english-words.txt", "r", "utf-8")
stopwords = f.read().split(',')
stopword_ids = []
for word in stopwords:
    try:
        stopword_ids.append(dictionary.token2id[word])
    except KeyError:
        continue
dictionary.filter_tokens(stopword_ids)

documentToBow = [dictionary.doc2bow(para) for para in listOfParagraphs]

# Create tf-idf model, LSI model, and similarity matrices
tfidfModel = gensim.models.TfidfModel(documentToBow)
tfidfCorpus = tfidfModel[documentToBow]
tfidfMatrix = gensim.similarities.MatrixSimilarity(tfidfCorpus)

lsiModel = gensim.models.LsiModel(tfidfCorpus, id2word=dictionary, num_topics=100)
lsiCorpus = lsiModel[documentToBow]
lsiMatrix = gensim.similarities.MatrixSimilarity(lsiCorpus)

print("Report and try to interpret the first 3 LSI topics:")
topics = lsiModel.show_topics(3)
print("First 3 LSI topics:", topics)
def build_dict(data_lst):
    dictionary = Dictionary(data_lst)
    dictionary.filter_tokens(list(map(dictionary.token2id.get, STOPWORDS)))
    dictionary.filter_extremes(no_below=3)  # keep_n=10000 would also cap the vocabulary
    dictionary.compactify()
    return dictionary
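# A hedged usage sketch for build_dict, assuming STOPWORDS is
# gensim.parsing.preprocessing.STOPWORDS; token2id.get returns None for
# stopwords missing from the dictionary, and filter_tokens ignores those.
docs = [["data", "model", "the"], ["data", "model", "a"],
        ["data", "model", "topic"], ["data", "topic", "noise"]]
print(build_dict(docs).token2id)  # only tokens in >= 3 documents survive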