def main():
    """Train and save an LSI model over a sublexicalized Wikipedia corpus.

    Command-line options:
        -f/--corpus-file          bz2 corpus file handed to WikiCorpus (required)
        -m/--model-file           file the trained LSI model is saved to (required)
        -v/--vocabulary           vocabulary file; loaded when it exists, otherwise
                                  the vocabulary is built and saved there
        -t/--tfidf-model          TF-IDF model file, handled like --vocabulary
        -p/--parse-procs          ``processes`` argument of WikiCorpus
        -s/--sublexicalize-procs  ``n_proc`` argument of SublexicalizedCorpus

    A missing required option terminates the program through
    ``parser.error`` (usage message, non-zero exit status).
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    parser = OptionParser()
    parser.add_option('-f', '--corpus-file')
    parser.add_option('-p', '--parse-procs', default=1, type=int)
    parser.add_option('-s', '--sublexicalize-procs', default=1, type=int)
    parser.add_option('-t', '--tfidf-model')
    parser.add_option('-v', '--vocabulary')
    parser.add_option('-m', '--model-file')
    opts, args = parser.parse_args()

    # Report missing mandatory options instead of exiting silently with
    # a success status.
    if not opts.corpus_file:
        parser.error('a corpus file is required (-f/--corpus-file)')
    if not opts.model_file:
        parser.error('a model file is required (-m/--model-file)')

    corpus_fn = opts.corpus_file
    n_proc_parse = opts.parse_procs
    n_proc_sublex = opts.sublexicalize_procs
    vocab_fn = opts.vocabulary
    tfidf_fn = opts.tfidf_model
    model_fn = opts.model_file

    # The handle itself is not used; opening the file here only makes an
    # unreadable corpus path fail before any further work is done.
    with BZ2File(corpus_fn):
        corpus = SublexicalizedCorpus(
            WikiCorpus(corpus_fn, processes=n_proc_parse,
                       dictionary=Dictionary()),
            order=(3, 6),
            clean_func=normalize_whitespace,
            n_proc=n_proc_sublex,
            create_dictionary=False)

    if vocab_fn and os.path.exists(vocab_fn):
        logging.info("Loading vocabulary from %s", vocab_fn)
        vocab = Dictionary.load(vocab_fn)
    else:
        logging.info("Creating vocabulary")
        # time.clock() was removed in Python 3.8; time.time() measures
        # elapsed wall-clock time on every supported version.
        start = time.time()
        vocab = Dictionary(corpus.get_texts())
        end = time.time()
        logging.info("Vocabulary created in %d seconds", end - start)
        if vocab_fn:
            logging.info("Saving dictionary to %s", vocab_fn)
            vocab.save(vocab_fn)

    corpus.dictionary = vocab
    corpus.dictionary.filter_extremes(no_below=5, no_above=.8)
    corpus.dictionary.compactify()

    if tfidf_fn and os.path.exists(tfidf_fn):
        logging.info("Reading TF-IDF model from %s", tfidf_fn)
        tfidf = TfidfModel.load(tfidf_fn)
    else:
        logging.info("creating TF-IDF model")
        tfidf = TfidfModel(corpus)
        if tfidf_fn:
            logging.info("Saving TF-IDF model to %s", tfidf_fn)
            tfidf.save(tfidf_fn)

    # Lazily TF-IDF-weight each article; the generator can be consumed once.
    bow_corpus = (tfidf[art] for art in corpus)

    model = LsiModel(corpus=bow_corpus, num_topics=10,
                     id2word=corpus.dictionary)
    model.save(model_fn)
def save_dictionary(
    dic: corpora.Dictionary,
    filename: str
) -> None:
    """Save ``dic`` to ``filename`` and print how many items it holds."""
    dic.save(filename)
    item_count = len(dic.values())
    message = "saved dictionary: {} items to {}".format(item_count, filename)
    print(message)
def bag_of_words(lemma, dict_path='text.dict'):
    """Build a dictionary and a bag-of-words corpus from lemmatised documents.

    Args:
        lemma: iterable of documents, each document being an iterable of
            lemmatised tokens.
        dict_path: file the dictionary is saved to. Defaults to
            ``'text.dict'``, the path that used to be hard-coded.

    Returns:
        A tuple ``(bow, dictionary)``: ``bow`` is a list with one
        ``dictionary.doc2bow`` result per document, ``dictionary`` is the
        ``Dictionary`` built from all documents. No TF-IDF weighting is
        applied here.
    """
    # The documents are traversed twice (once for the vocabulary, once for
    # the vectors). Materialise one-shot iterables so the second pass does
    # not silently produce an empty corpus.
    if not isinstance(lemma, (list, tuple)):
        lemma = list(lemma)

    dictionary = Dictionary(lemma)
    dictionary.save(dict_path)

    bow = [dictionary.doc2bow(doc) for doc in lemma]
    return (bow, dictionary)
def fetch_dict():
    """Build the module-level ``dictionary`` from ``my_dictionary`` and save it.

    Tokens whose document frequency is exactly 1 are removed, the ids are
    compacted, and the result is written to ``"Topic/dic.loc"``.

    Returns:
        The ``Dictionary`` that was built (also bound to the global
        ``dictionary``).
    """
    global dictionary
    # Dictionary accepts any iterable of documents; no intermediate copy needed.
    dictionary = Dictionary(my_dictionary)
    # .items() exists on Python 2 and 3 alike; .iteritems() is Python 2 only.
    once_ids = [tokenid
                for tokenid, docfreq in dictionary.dfs.items()
                if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()
    dictionary.save("Topic/dic.loc")
    return dictionary
def create_dictionary(doc_iterator, dict_file, as_text=False):
    """
    Build a gensim.corpora.Dictionary from a stream of documents and write
    it to ``dict_file``.

    Each document is stripped, lower-cased and split on whitespace; the
    token lists are produced lazily so the documents are never all held
    in memory at once.

    @Params:
      as_text - flag: save the dictionary as text (default: binary)
    """
    token_stream = (doc.strip().lower().split() for doc in doc_iterator)
    vocabulary = Dictionary(token_stream)
    # Choose the serialiser once, then write.
    writer = vocabulary.save_as_text if as_text else vocabulary.save
    writer(dict_file)
def build_corpora(db):
    """Create a dictionary and bag-of-words corpus from ``db.articles``.

    The articles are read twice: the first pass grows the dictionary from
    every article's ``clean_text``, the second pass (after
    ``filter_extremes()`` with its default thresholds) converts each
    article to a bag-of-words vector. The corpus is serialised to
    ``data/corpus.mm`` and the dictionary saved to ``data/cnn.dict``.

    Returns:
        ``(corpus, dictionary)``
    """
    dictionary = Dictionary()

    # Pass 1: vocabulary only; the vectors produced here are discarded.
    for record in db.articles.find():
        dictionary.doc2bow(record['clean_text'], allow_update=True)
    dictionary.filter_extremes()

    # Pass 2: vectorise against the filtered vocabulary.
    corpus = [
        dictionary.doc2bow(record['clean_text'])
        for record in db.articles.find()
    ]

    gensim.corpora.MmCorpus.serialize('data/corpus.mm', corpus)
    dictionary.save('data/cnn.dict')
    return corpus, dictionary
def fetch_dict(): print "Fetching Dictionary...", try: dictionary=Dictionary().load("Topic/dic.tm") print "Dictionary loaded!" except IOError: print "Dictionary not found, building Dictionary..." dictionary=Dictionary(i for i in MyDictionary()) once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1] dictionary.filter_tokens(once_ids) dictionary.compactify() print "\rDictionary Built!" print dictionary dictionary.save("Topic/dic.tm") return dictionary
def run(): try: print "starting to build LSI Model" start = datetime.now() documents = Feature.objects.exclude(text=None).values_list("text", flat=True) number_of_documents = len(documents) print "number_of_documents:", number_of_documents stopwords = [] stopwords += [month.lower() for month in month_to_number.keys()] stopwords += nltk_stopwords.words('english') print "stopwords:", len(stopwords) with open(path_to_directory_of_this_file + "/stopwords.txt") as f: stopwords.extend([word for word in f.read().decode("utf-8").split("\n") if word and not word.startswith("#")]) stopwords = set(stopwords) texts = [[word for word in document.lower().replace("#"," ").replace("_"," ").replace("("," ").replace(")"," ").replace("/"," ").replace(":"," ").replace("."," ").split() if word not in stopwords and len(word) > 3 ] for document in documents] counter = Counter() for text in texts: counter.update(text) texts = [[token for token in text if counter[token] > 1] for text in texts] dictionary = Dictionary(texts) print "dictionary:", dictionary dictionary.save(path_to_directory_of_this_file + "/dictionary") corpus = [dictionary.doc2bow(text) for text in texts] print "corpus:", type(corpus) print "generating lsi model" lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=10) print "saving LSI model" lsi.save(path_to_directory_of_this_file + "/model") Topic.objects.all().delete() topics = [] for topic in lsi.show_topics(): topics.append(Topic(id=topic[0], name=prettify_topic(topic[1]))) Topic.objects.bulk_create(topics) except Exception as e: print e
def _compress_and_remove(path):
    """Write a bzip2 copy of ``path`` to ``path + '.bz2'``, then delete ``path``."""
    with open(path, 'rb') as source:
        with bz2.BZ2File(path + '.bz2', 'wb', compresslevel=9) as target:
            copyfileobj(source, target)
    os.remove(path)


def main():
    """Build a vocabulary and an IDF model from whitespace-tokenised stdin.

    Command-line options:
        -o/--output-file  base name of the outputs (required)
        -e/--encoding     when given, stdin and stdout are wrapped with
                          this codec

    Each input line is one document. Outputs are
    ``<output-file>.bz2`` (vocabulary) and ``<output-file>.tfidf.bz2``
    (TF-IDF model built from the vocabulary); the uncompressed
    intermediates are removed.
    """
    parser = ArgumentParser()
    parser.add_argument('-e', '--encoding')
    parser.add_argument('-o', '--output-file')
    args = parser.parse_args()

    encoding = args.encoding
    output_fn = args.output_file

    if not output_fn:
        # Explain the failure instead of exiting silently.
        parser.error('an output file is required (-o/--output-file)')

    if encoding:
        # The codecs wrappers operate on byte streams. Python 3 exposes
        # them as ``.buffer``; Python 2 streams have no such attribute and
        # are wrapped directly.
        sys.stdout = codecs.getwriter(encoding)(
            getattr(sys.stdout, 'buffer', sys.stdout))
        sys.stdin = codecs.getreader(encoding)(
            getattr(sys.stdin, 'buffer', sys.stdin))

    texts = (line.split() for line in sys.stdin)

    logging.info('Creating vocabulary ...')
    vocab = Dictionary(texts)

    logging.info('Saving vocabulary to %s ...', output_fn + '.bz2')
    vocab.save(output_fn)

    logging.info('Compressing vocabulary ...')
    _compress_and_remove(output_fn)

    logging.info('Creating IDF model ...')
    tfidf = TfidfModel(dictionary=vocab)

    logging.info('Saving IDF model to %s ...', output_fn + '.tfidf.bz2')
    tfidf.save(output_fn + '.tfidf')

    logging.info('Compressing IDF model ...')
    _compress_and_remove(output_fn + '.tfidf')
def prepare_data():
    """Build the bag-of-words corpus required by ``learn``.

    Every file in ``datasets/dspace`` is parsed as JSON. Records whose
    ``abstract`` is ``None`` are skipped; the others are split on
    whitespace and passed through ``tokenize``. The dictionary built from
    all documents is filtered (``no_below=5``, ``no_above=0.3``) and saved
    to ``lda.dict``.

    Returns:
        A list with one ``doc2bow`` vector per kept document. A list is
        returned (rather than ``map``'s one-shot iterator on Python 3) so
        the corpus can be traversed more than once.
    """
    # Original note: "skips datasets/dspace/2481.json" -- presumably because
    # that record has a null abstract; TODO confirm.
    base = 'datasets/dspace'
    documents = []
    for filename in tqdm(os.listdir(base)):
        path = os.path.join(base, filename)
        with open(path) as f:
            d = json.load(f)
        abstract = d['abstract']
        if abstract is not None:
            documents.append(tokenize(abstract.split()))

    dictionary = Dictionary(documents)
    dictionary.filter_extremes(no_below=5, no_above=0.3)
    dictionary.save('lda.dict')

    return [dictionary.doc2bow(document) for document in documents]
def get_topics_lda(tokens, n_topics=10): """ Using the `gensim` package for LDA. LDA is a little better than LSA as it provides a reasonal mixture of topics (Wikipedia). `gensim` is a package for topic modeling only. So for a particular topic modeling task, it is a lighter option to install and run. Also it can be run distributed and updated over an existing model :param tokens: Preprocessed tokens for faster dictionary building :param n_topics: Number of topics to decompose data to :return: list() of topics """ dict_file = 'resources/deals.dict' if not os.path.isfile(dict_file): print "Dictionary file does not exist. Creating one" dictionary = Dictionary(tokens) freq1 = [id for id, freq in dictionary.dfs.iteritems() if freq == 1] dictionary.filter_tokens(freq1) dictionary.compactify() dictionary.save(dict_file) dictionary = Dictionary.load(dict_file) # print dictionary corpus_file = 'resources/deals.mm' if not os.path.isfile(corpus_file): print "Corpus file does not exist. Creating one" corpus = [dictionary.doc2bow(token) for token in tokens] MmCorpus.serialize(corpus_file, corpus) mm = MmCorpus(corpus_file) # print mm # tfidf = TfidfModel(mm) # corpus_tfidf = tfidf[mm] lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics, update_every=1, chunksize=1000, passes=1) topics = [] for i in range(0, n_topics): words = lda.print_topic(i).split('+') topic = [] for word in words: score, w = word.split('*') topic.append((w, score)) topics.append(topic) return topics
def _build_model(self, all_documents, remove_once=False):
    '''
    Builds the lsa model, or loads it from the cache.

    The cache key is ``hash_obj(all_documents)`` together with the
    ``remove_once`` flag. When the corpus, dictionary and LSI cache files
    all exist they are loaded; otherwise the documents are tokenized, the
    model is built (20 topics) and all three artefacts are written to the
    cache.

    Args:
        all_documents: documents passed one by one to ``self.tokenize``
        remove_once: drop tokens that occur exactly once over all documents

    Returns:
        dictionary, corpus, lsi model. The corpus is an ``MmCorpus`` when
        loaded from the cache and a list of bag-of-words vectors when
        freshly built.
    '''
    from collections import Counter

    doc_hash = hash_obj(all_documents)
    flag = str(int(remove_once))
    cache_prefix = CACHE_DIR + '/' + doc_hash
    corp_cache_path = cache_prefix + '_corp_' + flag
    dic_cache_path = cache_prefix + '_dic_' + flag
    lsi_cache_path = cache_prefix + '_lsi_' + flag

    cache_paths = (corp_cache_path, dic_cache_path, lsi_cache_path)
    if all(os.path.exists(path) for path in cache_paths):
        lsi = models.LsiModel.load(lsi_cache_path)
        corp = MmCorpus(corp_cache_path)
        dic = Dictionary.load(dic_cache_path)
    else:
        texts = [self.tokenize(doc) for doc in all_documents]
        if remove_once:
            # One counting pass replaces the former list concatenation plus
            # list.count() per word, which was quadratic in corpus size.
            counts = Counter(word for text in texts for word in text)
            texts = [[word for word in text if counts[word] > 1]
                     for text in texts]
        dic = Dictionary(texts)
        corp = [dic.doc2bow(text) for text in texts]

        MmCorpus.serialize(corp_cache_path, corp)
        dic.save(dic_cache_path)
        lsi = models.LsiModel(
            corp, id2word=dic, num_topics=20)
        lsi.save(lsi_cache_path)

    return dic, corp, lsi
class TfidfVectorizer():
    """
    Transform text to tf-idf representation.

    The dictionary and the TF-IDF model live in the files ``dictionary``
    and ``tfidf`` next to this module. ``construct_model`` creates both;
    the ``obtain_*`` methods load them on first use.
    """

    def __init__(self):
        self.base_path = os.path.dirname(__file__)
        self.dictionary_path = os.path.join(self.base_path, "dictionary")
        self.tf_idf_model_path = os.path.join(self.base_path, "tfidf")
        self.stemmer = NepStemmer()
        # Both are filled in by construct_model() or load_data().
        self.dictionary = None
        self.tf_idf_model = None

    def get_tokens(self, document):
        """Return the stems of ``document``; raise if no stemmer is set."""
        if not self.stemmer:
            raise Exception("Stemmer not available")
        return self.stemmer.get_stems(document)

    def construct_model(self, documents):
        """Build and save the dictionary and the TF-IDF model.

        The dictionary is filtered with ``no_below=5``, ``no_above=0.5``
        and ``keep_n=1000`` before the model is derived from it.
        """
        logging.basicConfig(
            format='%(asctime)s:%(levelname)s:%(message)s',
            level=logging.INFO
        )

        logging.info("Obtaining word tokens")
        tokens = [self.get_tokens(document) for document in documents]

        logging.info("Constructing dictionary")
        self.dictionary = Dictionary(tokens)
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
        self.dictionary.compactify()
        self.dictionary.save(self.dictionary_path)

        logging.info("Constructing TF-IDF model")
        self.tf_idf_model = TfidfModel(dictionary=self.dictionary)
        self.tf_idf_model.save(self.tf_idf_model_path)

    def load_data(self):
        """Load dictionary and model from disk unless a model is already set.

        Raises:
            Exception: if the TF-IDF model file does not exist.
        """
        if not self.tf_idf_model:
            if not os.path.exists(self.tf_idf_model_path):
                raise Exception('TF-IDF model file not found')
            self.dictionary = Dictionary.load(self.dictionary_path)
            self.tf_idf_model = TfidfModel.load(self.tf_idf_model_path)

    def _feature_count(self):
        """Return the dense vector length: the number of idf entries in the model."""
        return len(self.tf_idf_model.idfs)

    def doc2vector(self, document):
        """ Returns the sparse tf-idf vector for given document """
        tokens = self.get_tokens(document)
        bag_of_words = self.dictionary.doc2bow(tokens)
        return self.tf_idf_model[bag_of_words]

    def obtain_feature_vector(self, document):
        """
        Returns a single dense tf-idf vector for a given document,
        reshaped to one row.
        """
        self.load_data()
        # The width comes from the model, as in obtain_feature_matrix;
        # the attribute read here before was never assigned.
        tf_idf_vector = matutils.sparse2full(
            self.doc2vector(document), self._feature_count()
        ).reshape(1, -1)
        return tf_idf_vector

    def obtain_feature_matrix(self, documents):
        """
        Returns the tf-idf dense matrix for the given documents
        """
        self.load_data()
        input_matrix_sparse = [self.doc2vector(x) for x in documents]
        # corpus2dense's result is transposed before it is returned.
        input_matrix = matutils.corpus2dense(
            input_matrix_sparse, self._feature_count()
        ).transpose()
        return input_matrix
class Builder(object):
    """Prepare texts for modelling: phrase detection, vocabulary, dictionary.

    Data files (bad phrases, vocabulary) are kept in ``data_directory``;
    trained phrasers and the dictionary are kept in ``model_directory``.
    The bad-phrase list is read from disk when the object is created.
    """

    def __init__(self,
                 ndocs,
                 phrase_min_count=5,
                 vocabulary_size=10000,
                 bigram_min_count=5,
                 bigram_threshold=10,
                 trigram_min_count=5,
                 trigram_threshold=10,
                 substitutions=None,
                 data_directory='./data',
                 model_directory='./model'):
        self.ndocs = ndocs
        self.phrase_min_count = phrase_min_count
        self.vocabulary_size = vocabulary_size
        self.bigram_min_count = bigram_min_count
        self.bigram_threshold = bigram_threshold
        self.trigram_min_count = trigram_min_count
        self.trigram_threshold = trigram_threshold
        # A fresh dict per instance: a dict literal in the signature would
        # be shared by every Builder created without this argument.
        self.substitutions = {} if substitutions is None else substitutions
        self.data_directory = data_directory
        self.model_directory = model_directory
        self.load_bad_phrases()

    def tokenize(self, text):
        """Return the lower-cased word tokens of ``text``."""
        return [token.lower() for token in word_tokenize(text)]

    def stream_sentences(self, texts, description="Streaming sentences ..."):
        """Yield the token list of every sentence in ``texts``, with a progress bar."""
        with tqdm.tqdm(texts) as pbar:
            pbar.set_description(description)
            for text in pbar:
                for sentence in sent_tokenize(text):
                    yield self.tokenize(sentence)

    def _bad_phrases_path(self):
        """Return the location of the bad-phrase list."""
        return "%s/bad-phrases.txt" % self.data_directory

    def load_bad_phrases(self):
        """Read the bad-phrase list (one phrase per line) into ``self.bad_phrases``."""
        with open(self._bad_phrases_path(), mode='r', encoding='UTF-8') as fp:
            self.bad_phrases = {phrase.strip() for phrase in fp}

    def add_bad_phrase(self, phrase):
        """Add ``phrase`` to the in-memory bad-phrase set."""
        self.bad_phrases.add(phrase)

    def save_bad_phrases(self):
        """Write the bad phrases back to disk, sorted, one per line."""
        with open(self._bad_phrases_path(), mode='w', encoding='UTF-8') as fp:
            for phrase in sorted(self.bad_phrases):
                fp.write("%s\n" % phrase)

    def train_phrasers(self, texts):
        """Train the bigram phraser, then the trigram phraser on top of it.

        ``texts`` is traversed twice, once per phraser.
        """
        bigrams = Phrases(
            self.stream_sentences(
                texts, description="Streaming text for bigram phraser ..."),
            min_count=self.bigram_min_count,
            threshold=self.bigram_threshold)
        self.bigram_phraser = Phraser(bigrams)

        trigrams = Phrases(
            self.bigram_phraser[self.stream_sentences(
                texts, description="Streaming text for trigram phraser ...")],
            min_count=self.trigram_min_count,
            threshold=self.trigram_threshold)
        self.trigram_phraser = Phraser(trigrams)

    def save_phrasers(self):
        """Save both phrasers into ``model_directory``."""
        path = os.path.join(self.model_directory, "bigram-phraser.pkl")
        self.bigram_phraser.save(path)
        path = os.path.join(self.model_directory, "trigram-phraser.pkl")
        self.trigram_phraser.save(path)

    def load_phrasers(self):
        """Load both phrasers from ``model_directory``."""
        path = os.path.join(self.model_directory, "bigram-phraser.pkl")
        self.bigram_phraser = Phraser.load(path)
        path = os.path.join(self.model_directory, "trigram-phraser.pkl")
        self.trigram_phraser = Phraser.load(path)

    def prepare_text(self, text):
        """Apply substitutions, tokenize, apply both phrasers, drop bad phrases."""
        for key, value in self.substitutions.items():
            text = text.replace(key, value)
        tokens = self.tokenize(text)
        tokens = self.bigram_phraser[tokens]
        tokens = self.trigram_phraser[tokens]
        return [token for token in tokens if token not in self.bad_phrases]

    def prepare_texts(self, texts):
        """Return ``prepare_text`` applied to every text, with a progress bar."""
        with tqdm.tqdm(texts) as pbar:
            pbar.set_description("Preparing texts ...")
            prepared_texts = [self.prepare_text(text) for text in pbar]
        return prepared_texts

    def keep_phrase(self, phrase, cnt):
        """Decide whether ``phrase`` (seen ``cnt`` times) enters the vocabulary.

        A phrase is rejected when it contains an apostrophe, any
        ``PUNCTUATION`` character or any ``SYMBOLS`` character, or when it
        is a bad phrase. Otherwise it is kept only if it contains at least
        one of ``LETTERS`` and ``cnt`` is strictly greater than
        ``phrase_min_count``.
        """
        if "'" in phrase:
            return False
        if any(c in phrase for c in PUNCTUATION):
            return False
        if phrase in self.bad_phrases:
            return False
        phrase_chars = set(phrase)
        if SYMBOLS & phrase_chars:
            return False
        return bool(LETTERS & phrase_chars) and cnt > self.phrase_min_count

    def _vocabulary_path(self, ndocs):
        """Return the vocabulary file name for ``ndocs`` documents and the current settings."""
        return os.path.join(
            self.data_directory, "vocabulary-%d-%d-%d.tsv" %
            (ndocs, self.phrase_min_count, self.vocabulary_size))

    def build_vocabulary(self, texts, save=False):
        """Count phrases over ``texts`` and keep the most frequent acceptable ones.

        ``self.ndocs`` is set to ``len(texts)``. Phrases are ordered by
        decreasing count and filtered with ``keep_phrase``. A phrase is
        dropped when it equals a hyphenated vocabulary phrase with its
        hyphens replaced by underscores. The list is cut to
        ``vocabulary_size`` entries. With ``save=True`` the result is
        written as "phrase<TAB>count" lines.
        """
        self.ndocs = len(texts)
        phrase_map = {}
        with tqdm.tqdm(texts) as pbar:
            pbar.set_description("Building vocabulary over %d documents." %
                                 self.ndocs)
            for document in pbar:
                for phrase in document:
                    phrase_map[phrase] = phrase_map.get(phrase, 0) + 1

        phrases = sorted(phrase_map, key=lambda phrase: -phrase_map[phrase])
        vocabulary = [
            phrase for phrase in phrases
            if self.keep_phrase(phrase, phrase_map[phrase])
        ]
        hyphenated = {
            phrase.replace('-', '_')
            for phrase in vocabulary if "-" in phrase
        }
        vocabulary = [
            phrase for phrase in vocabulary if phrase not in hyphenated
        ][:self.vocabulary_size]

        if save:
            path = self._vocabulary_path(len(texts))
            with open(path, mode='w', encoding='UTF-8') as fp:
                for phrase in vocabulary:
                    fp.write("%s\t%d\n" % (phrase, phrase_map[phrase]))
        self.vocabulary = set(vocabulary)

    def load_vocabulary(self):
        """Load the vocabulary file matching ``ndocs`` and the current settings."""
        vocabulary = set()
        with open(self._vocabulary_path(self.ndocs), mode='r',
                  encoding='UTF-8') as fp:
            for line in fp:
                line = line.strip()
                if line:
                    # The count column is not needed once loaded.
                    phrase, _ = line.split('\t')
                    vocabulary.add(phrase)
        self.vocabulary = vocabulary

    def build_document(self, text):
        """Return the phrases of ``text`` that are in the vocabulary."""
        return [phrase for phrase in text if phrase in self.vocabulary]

    def build_corpus(self, texts):
        """Return ``build_document`` applied to every text, with a progress bar."""
        with tqdm.tqdm(texts) as pbar:
            pbar.set_description("Building corpus ...")
            corpus = [self.build_document(text) for text in pbar]
        return corpus

    def build_dictionary(self, corpus, save=False):
        """Build the dictionary from ``corpus`` and filter its extremes."""
        self.dictionary = Dictionary(corpus)
        self.dictionary.filter_extremes(no_below=self.phrase_min_count,
                                        no_above=0.6,
                                        keep_n=self.vocabulary_size)
        if save:
            self.save_dictionary()

    def save_dictionary(self, path=None):
        """Save the dictionary; defaults to ``dictionary.pkl`` in ``model_directory``."""
        if path is None:
            path = os.path.join(self.model_directory, "dictionary.pkl")
        self.dictionary.save(path)

    def load_dictionary(self, path=None):
        """Load the dictionary; defaults to ``dictionary.pkl`` in ``model_directory``."""
        if path is None:
            path = os.path.join(self.model_directory, "dictionary.pkl")
        self.dictionary = Dictionary.load(path)

    def encode_corpus(self, corpus):
        """Return the bag-of-words vector of every document in ``corpus``."""
        return [self.dictionary.doc2bow(document) for document in corpus]
# docs[idx].append(token) # stem the token ps = PorterStemmer() docs = [[ps.stem(token) for token in doc] for doc in docs] # Remove rare and common tokens. # Create a dictionary representation of the documents. dictionary = Dictionary(docs) # Filter out words that occur less than 1000 documents, or more than 50% of the documents. dictionary.filter_extremes(no_below=1000, no_above=0.5) # path for saved dictionary: /Users/rachelzheng/opt/anaconda3/lib/python3.7/site-packages/gensim/test/test_data/dict-www-cnndm # saved /home/rachelzheng/www/venv/lib/python3.6/site-packages/gensim/test/test_data/dict-www-cnndm dictionary.save(datapath('dict-www-cnndm-unigram')) # dictionary = Dictionary.load(datapath('dict-www-cnndm')) # Bag-of-words representation of the documents. # corpus = [dictionary.doc2bow(doc) for doc in docs] # Number of unique tokens: 88978 - plus bigram # Number of unique tokens: 44984 - unigram - 20 documents # Number of unique tokens: 21185 - unigram - 100 documents # Number of unique tokens: 6439 - unigram - 1000 docyments # Number of documents: 287113 print('Number of unique tokens: %d' % len(dictionary)) # print('Number of documents: %d' % len(corpus)) # Make a index to word dictionary. # temp = dictionary[0] # This is only to "load" the dictionary.
except KeyError as e: print(e) continue # replace encoded characters for code, value in replace_dict.items(): text = text.replace(code, value).lower().strip() keywords = [ kw.replace(code, value).lower().strip() for kw in keywords ] abstract = abstract.replace(code, value).lower().strip() title = title.replace(code, value).lower().strip() text = title + abstract + text doc = nlp(text) # clean: doc = [ token.lemma_ for token in doc if not token.is_stop and token.shape > 2 and not token.is_currency and not token.is_punct and not token.is_digit ] documents.append(doc) print("create dict") dct = Dictionary(documents) dct.save("../data/models/tfidf/dictionary.model") print("create tfidf") model = TfidfModel(dictionary=dct) model.save("../data/models/tfidf/tfidf.model")
def build_lda_model(tokens_tags,
                    pos_tags,
                    use_nouns=True,
                    use_verbs=True,
                    use_all=False,
                    num_of_topics=8,
                    passes=25,
                    verbose=True):
    """Train an LDA topic model (or reuse a saved one) and return it.

    All artefacts live under the parent of the current working directory:
    the model in ``models/topic_models``, the dictionary in
    ``res/topic_data/dict`` and the corpus in ``res/topic_data/corpus``.
    File names encode the topic count and the ``use_*`` flags. Training
    happens only when the model file does not exist; the dictionary,
    corpus and model are always loaded from disk before returning.

    Args:
        tokens_tags: sequence of whitespace-separated token strings, one
            per document.
        pos_tags: sequence of whitespace-separated POS-tag strings,
            indexed in parallel with ``tokens_tags``.
        use_nouns, use_verbs, use_all: forwarded to
            ``data_proc.extract_lemmatized_tweet``.
        num_of_topics: number of LDA topics.
        passes: number of training passes.
        verbose: print intermediate data and per-document topics.

    Returns:
        ``(dictionary, mm_corpus, lda_model)`` as loaded from disk.
    """
    # os.path handles separators portably; slicing the cwd at the last '/'
    # breaks when the path contains no forward slash.
    path = os.path.dirname(os.getcwd())

    topics_filename = str(num_of_topics) + "topics"
    if use_nouns:
        topics_filename += "_nouns"
    if use_verbs:
        topics_filename += "_verbs"
    if use_all:
        topics_filename += "_all"

    # Set the LDA, Dictionary and Corpus filenames
    lda_filename = os.path.join(
        path, "models", "topic_models", "lda_" + topics_filename + ".model")
    dict_filename = os.path.join(
        path, "res", "topic_data", "dict", "dict_" + topics_filename + ".dict")
    corpus_filename = os.path.join(
        path, "res", "topic_data", "corpus",
        "corpus_" + topics_filename + ".mm")

    # Build a topic model if it wasn't created yet
    if not os.path.exists(lda_filename):
        # Extract the lemmatized documents
        docs = []
        for index, token_line in enumerate(tokens_tags):
            tokens = token_line.split()
            pos = pos_tags[index].split()
            docs.append(
                data_proc.extract_lemmatized_tweet(tokens, pos, use_verbs,
                                                   use_nouns, use_all))

        # Compute the dictionary and save it
        dictionary = Dictionary(docs)
        dictionary.filter_extremes(keep_n=40000)
        dictionary.compactify()
        dictionary.save(dict_filename)

        # Compute the bow corpus and save it
        corpus = [dictionary.doc2bow(d) for d in docs]
        MmCorpus.serialize(corpus_filename, corpus)

        if verbose:
            print("\nCleaned documents:", docs)
            print("\nDictionary:", dictionary)
            print("\nCorpus in BoW form:", corpus)

        # Start training an LDA Model
        start = time.time()
        print("\nBuilding the LDA topic model...")
        lda_model = LdaModel(corpus=corpus,
                             num_topics=num_of_topics,
                             passes=passes,
                             id2word=dictionary)
        lda_model.save(lda_filename)
        end = time.time()
        print("Completion time for building LDA model: %.3f s = %.3f min" %
              ((end - start), (end - start) / 60.0))

        if verbose:
            print("\nList of words associated with each topic:")
            lda_topics = lda_model.show_topics(formatted=False)
            lda_topics_list = [[word for word, prob in topic]
                               for topic_id, topic in lda_topics]
            print(lda_topics_list)

    # Load the previously saved dictionary
    dictionary = Dictionary.load(dict_filename)

    # Load the previously saved corpus
    mm_corpus = MmCorpus(corpus_filename)

    # Load the previously saved LDA model
    lda_model = LdaModel.load(lda_filename)

    if verbose:
        # Print the top 10 words for each topic
        for topic_id in range(num_of_topics):
            print("\nTop 10 words for topic ", topic_id)
            print([
                dictionary[word_id]
                for (word_id,
                     prob) in lda_model.get_topic_terms(topic_id, topn=10)
            ])

        document_topics = lda_model.get_document_topics(
            mm_corpus, per_word_topics=True)
        for index, (doc_topics, word_topics,
                    word_phis) in enumerate(document_topics):
            print('Index ', index)
            print('Document topics:', doc_topics)
            print('Word topics:', word_topics)
            print('Phi values:', word_phis)
            print('-------------- \n')

    return dictionary, mm_corpus, lda_model
def run_tm(topics, below, above, chunksize, passes, iterations):
    """Train an LDA model on the pickled documents and print its coherence.

    The raw arguments are first checked by ``arevalid``. If they are
    invalid, a window showing the returned message is opened and nothing
    else happens. Otherwise the dictionary is built from ``../data/docs``
    and saved, the bag-of-words corpus is pickled, the model is trained
    and saved to ``../data/model/LDA_model_v1``, and the average topic
    coherence plus the top topics are printed.

    Args:
        topics: number of topics (converted with ``int``).
        below: ``no_below`` for ``filter_extremes`` (``int``).
        above: ``no_above`` for ``filter_extremes`` (``float``).
        chunksize: documents per training chunk (``int``).
        passes: training passes over the corpus (``int``).
        iterations: loops over each document (``int``).
    """
    message, valid = arevalid(topics, below, above, chunksize, passes,
                              iterations)
    if not valid:
        error_window = Toplevel()
        error_window.title('Fehler')
        error_window.geometry('300x300')
        # Label carrying the error message
        error_label = Label(master=error_window, text=message)
        error_label.place(x=10, y=10, width=300, height=300)
        return

    with open('../data/docs', 'rb') as f:
        docs = pickle.load(f)

    tweet_dictionary = Dictionary(docs)
    tweet_dictionary.filter_extremes(no_below=int(below),
                                     no_above=float(above))
    tweet_dictionary.save('../data/tweet_dictionary')

    ngram_docs = ngrams(input_docs=docs)
    corpus = make_bow_corpus(tweet_dictionary, ngram_docs)
    with open('../data/bow_corpus', 'wb') as f:
        pickle.dump(corpus, f)

    print('Number of unique tokens: %d' % len(tweet_dictionary))
    print('Number of documents: %d' % len(corpus))

    # Training parameters.
    # Number of topics, here relatively low so we can interpret them more
    # easily -> can be set higher
    num_topics = int(topics)
    # Number of documents fed into the training algorithm (we have 7)
    chunk_size = int(chunksize)
    # Number of times trained on the entire corpus
    passes = int(passes)
    # Number of loops over each document
    iterations = int(iterations)
    # Don't evaluate model perplexity, takes too much time.
    eval_every = None

    # Make an index-to-word dictionary.
    _ = tweet_dictionary[0]  # This is only to "load" the dictionary.
    id2word = tweet_dictionary.id2token

    # Create model.
    # We set alpha = 'auto' and eta = 'auto'. Again this is somewhat
    # technical, but essentially we are automatically learning two
    # parameters in the model that we usually would have to specify
    # explicitly.
    model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     chunksize=chunk_size,
                     alpha='auto',
                     eta='auto',
                     iterations=iterations,
                     num_topics=num_topics,
                     passes=passes,
                     eval_every=eval_every)
    model_file = '../data/model/LDA_model_v1'
    model.save(model_file)

    # Tests
    # Top topics; num_words defaults to 20, input is our corpus in BOW format
    top_topics = model.top_topics(corpus)

    # Average topic coherence is the sum of topic coherences of all topics,
    # divided by the number of topics. Topic coherence measures score a
    # single topic by measuring the degree of semantic similarity between
    # high scoring words in the topic. These measurements help distinguish
    # between topics that are semantically interpretable and topics that
    # are artifacts of statistical inference.
    coherence_total = sum([t[1] for t in top_topics])
    avg_topic_coherence = coherence_total / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)

    pprint(top_topics)
mecab = MeCab.Tagger("-Owakati") # 辞書に含めない単語たち words_blacklist = [ ">>", # チャットのアノテーション "some_agent", "\u3000", # 全角スペースを意味している "。", "、", ] dct = Dictionary() # csvファイルの読み込み df = pd.read_csv(filepath, delimiter=",", names=["talker", "words", "type"]) # 文を分かち書き -> 半角スペースで区切り -> 最後の1文字(改行コード)を消したリストを得る wakati_df = df["words"].map(lambda x: mecab.parse(x).split(" ")[:-1]) # 辞書に追加 dct.add_documents(wakati_df) # ブラックリストの辞書内でのidを得る words_blacklist_id = dct.doc2idx(words_blacklist) # 辞書から削除 dct.filter_tokens(bad_ids=words_blacklist_id) #dct.filter_n_most_frequent(600) # 辞書の保存 dct.save(os.path.join(filedir, ".".join([filename, "dict"]))) # 辞書の中身と単語数の表示 print(dct.token2id) print(len(dct.token2id))
# Report the size of the input data.
print("Number of distinct users:", len(users))
print("Number of tweets total:", db_collection.count())

# Merge each user's tweets of one month into a single document. Two
# parallel results are returned; the second one (tweet_docs_prep) is what
# the dictionary, corpus and LDA below are built from.
print(
    "Concatenate all tweets of one person in one month to one single document..."
)
tweet_docs, tweet_docs_prep = concatenateTweetsOfMonthToDoc(
    users, db_collection, format="mongoDB")
print("Number of documents (concatenated tweets): ", len(tweet_docs))

# Build the dictionary and save it.
print("Create dictionary...")
dictionary = Dictionary(tweet_docs_prep)
print("Save dictionary (nTokens={}) to file {}...".format(
    len(dictionary.values()), path_save_dict))
dictionary.save(path_save_dict)

# Convert every document to a bag-of-words vector.
print("Create bag of words...")
corpus = [dictionary.doc2bow(text) for text in tweet_docs_prep]

# Comma-separated topic counts from the command line, spaces removed.
# The elements are strings (str.split), not ints as in the example below.
# NOTE(review): presumably calculateLDA converts them -- confirm.
list_num_topics = args.numberTopics.replace(" ", "").split(
    ",")  # e.g. [16, 20, 22, 24, 26, 28, 30]

# Train LDA for the requested topic counts.
print("Calculate LDA...")
lmlist, c_v, logPerplex = calculateLDA(dictionary=dictionary,
                                       corpus=corpus,
                                       texts=tweet_docs_prep,
                                       list_num_topics=list_num_topics,
                                       saveModelPath=path_save_LDA)

# Plotting (currently disabled):
# print("Plot...")
# plt.figure()
class VAEEmbeddingsSearchEngine(SmartSearchEngine):
    """Search engine ranking services by similarity of VAE encodings.

    Uses a Keras model as the base to compute document similarity.
    Documents become normalised bag-of-words vectors, are encoded with a
    ``VAEWasserstein`` model, and queries are ranked by cosine similarity
    against the encoded index.
    """

    def __init__(self):
        super(VAEEmbeddingsSearchEngine, self).__init__()
        self._service_array = []
        self._index = None
        self._corpus = None
        self._train_model = False
        self._load_wmd = False
        self._preprocessor = StringPreprocessor('english.long')

    def load_configuration(self, configuration_file):
        """Read the ``RegistryConfigurations`` section and set up the model.

        With ``train_model = true`` an untrained model is created from the
        configured hyper-parameters. Otherwise a default model is created,
        its state is loaded from ``models/vae.h5`` and the vectorizer is
        loaded from ``models/vectorizer.npy``.
        """
        super(VAEEmbeddingsSearchEngine,
              self).load_configuration(configuration_file)

        config = configparser.ConfigParser()
        config.read(configuration_file)
        section = 'RegistryConfigurations'

        latent_dim = config.getint(section, 'latent_dim')
        intermediate_dim = config.getint(section, 'intermediate_dim')
        batch_size = config.getint(section, 'batch_size')
        epochs = config.getint(section, 'epochs')
        learning_rate = config.getfloat(section, 'learning_rate')
        epsilon_std = config.getfloat(section, 'epsilon_std')
        self._precomputed_vectors_path = config.get(
            section, 'precomputed_vectors_path')

        if config.get(section, 'load_wmd_model').lower() == 'true':
            self._load_wmd = True

        if config.get(section, 'train_model').lower() == 'true':
            self._train_model = True
            if config.get(section, 'reproducible').lower() == 'true':
                self._model = VAEWasserstein(latent_dim,
                                             intermediate_dim,
                                             epsilon_std,
                                             batch_size,
                                             epochs,
                                             learning_rate,
                                             reproducible=True)
            else:
                self._model = VAEWasserstein(latent_dim, intermediate_dim,
                                             epsilon_std, batch_size, epochs,
                                             learning_rate)
        else:
            self._model = VAEWasserstein()
            self._model.load('models/vae.h5')
            self._vectorizer = Dictionary.load('models/vectorizer.npy')

    def _doc_to_nbow(self, document):
        """Return the length-normalised bag-of-words vector of ``document``."""
        vocab_len = len(self._vectorizer)
        d = np.zeros(vocab_len, dtype=np.double)
        nbow = self._vectorizer.doc2bow(document)  # Word frequencies.
        doc_len = len(document)
        for idx, freq in nbow:
            d[idx] = freq / float(doc_len)  # Normalized word frequencies.
        return d

    def _corpus_to_nbow(self, documents):
        """Return a (documents x vocabulary) matrix of normalised BoW rows."""
        corpus = np.zeros((len(documents), len(self._vectorizer)))
        for row, document in enumerate(documents):
            corpus[row, :] = self._doc_to_nbow(document)
        return corpus

    def unpublish(self, service):
        pass

    def _preprocess(self, bag_of_words):
        """Run the preprocessor over the word list of ``bag_of_words``."""
        words = bag_of_words.get_words_list()
        return self._preprocessor(words)

    def _save_obj(self, obj, name):
        """Pickle ``obj`` to ``models/<name>.pkl``."""
        with open('models/' + name + '.pkl', 'wb') as f:
            pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

    def _load_obj(self, name):
        """Unpickle and return the object stored in ``models/<name>.pkl``."""
        # The file is one this class writes itself via _save_obj; pickle
        # must not be pointed at data from untrusted sources.
        with open('models/' + name + '.pkl', 'rb') as f:
            return pickle.load(f)

    def _create_filter_vocab(self, documents, vocab):
        """Return the set of words in ``documents`` that are absent from ``vocab``."""
        return {
            word
            for document in documents for word in document
            if word not in vocab
        }

    @staticmethod
    def _drop_filtered(documents, filter_set):
        """Return ``documents`` with every word found in ``filter_set`` removed."""
        return [[word for word in document if word not in filter_set]
                for document in documents]

    def _after_publish(self, documents):
        """Build the vector index for ``documents``, training first if configured.

        Training mode: obtain the word filter and WMD distances (loaded
        from disk when ``load_wmd_model`` is set, otherwise computed from
        the word2vec vectors and saved), rebuild and save the vectorizer,
        train the model on a 67/33 split and save it. Otherwise only the
        stored word filter is applied. In both cases the documents are
        finally encoded into ``self._index``.
        """
        if self._train_model:
            if self._load_wmd:
                filter_set = self._load_obj('word_filter')
                documents = self._drop_filtered(documents, filter_set)
                self._vectorizer = Dictionary(documents)
                distance = WMDDistance.load('models/distances.npy',
                                            self._vectorizer)
            else:
                self._word2vec_model = models.KeyedVectors.load_word2vec_format(
                    self._precomputed_vectors_path, binary=False)
                self._word2vec_model.init_sims(replace=True)
                filter_set = self._create_filter_vocab(
                    documents, self._word2vec_model.vocab)
                self._save_obj(filter_set, 'word_filter')
                documents = self._drop_filtered(documents, filter_set)
                self._vectorizer = Dictionary(documents)
                distance = WMDDistance(self._vectorizer, self._word2vec_model)
                distance.save('models/distances')

            X = self._corpus_to_nbow(documents)
            # Close the handle once the dictionary has been written; it
            # used to be left open.
            with open('models/vectorizer.npy', 'wb') as handle:
                self._vectorizer.save(handle)
            X_train, X_test, _, _ = train_test_split(X,
                                                     np.zeros(X.shape),
                                                     test_size=0.33,
                                                     random_state=23)
            print(X_train)
            print(X_test)
            self._model.train(X_train, X_test, distance.get_distances())
            self._model.save('models/vae.h5')
        else:
            filter_set = self._load_obj('word_filter')
            documents = self._drop_filtered(documents, filter_set)
            X = self._corpus_to_nbow(documents)

        self._index = self._model.transform(X)

    def publish(self, service):
        pass

    def find(self, query):
        """Return all services ordered by decreasing similarity to ``query``."""
        query = StringTransformer().transform(query)
        query_vector = self._doc_to_nbow(
            self._query_transformer.transform(query).get_words_list())
        query_vector = np.expand_dims(query_vector, axis=0)
        query_vae = self._model.transform(query_vector)
        results = cosine_similarity(query_vae, self._index)
        ranked = sorted(enumerate(results[0]), key=lambda item: -item[1])
        return [self._service_array[position] for position, _ in ranked]

    def number_of_services(self):
        pass
class Preprocessor(object):
    """Convert raw TSV sentences and labels into integer id sequences.

    Two gensim ``Dictionary`` instances are kept: ``text_dict`` for sentence
    tokens and ``label_dict`` for labels. All files are read from and written
    to sub-directories of ``CURRENT_MAIN_PATH``.
    """

    def __init__(self):
        self.text_dict = Dictionary()
        self.label_dict = Dictionary()

    def SentencesToVectors(self, splited_sentences):
        """Return, per sentence, the list of ``text_dict`` ids of its tokens.

        A token that is not in the dictionary is encoded as ``0``.
        """
        return [
            [self.text_dict.token2id.get(token, 0) for token in sentence]
            for sentence in tqdm(splited_sentences)
        ]

    def LabelsToVectors(self, splited_labels):
        """Return the ``label_dict`` id of every label.

        Raises ``KeyError`` for a label the dictionary does not contain.
        """
        label_ids = []
        for label in splited_labels:
            label_ids.append(self.label_dict.token2id[label])
        return label_ids

    def SaveDicts(self):
        """Write both dictionaries below ``<main path>/dicts``."""
        text_path = CURRENT_MAIN_PATH + "/dicts/text_dict.dict"
        label_path = CURRENT_MAIN_PATH + "/dicts/label_dict.dict"
        self.text_dict.save(text_path)
        self.label_dict.save(label_path)

    def LoadDicts(self):
        """Replace both dictionaries with the copies stored on disk."""
        text_path = CURRENT_MAIN_PATH + '/dicts/text_dict.dict'
        label_path = CURRENT_MAIN_PATH + '/dicts/label_dict.dict'
        self.text_dict = self.text_dict.load(text_path)
        self.label_dict = self.label_dict.load(label_path)

    def SaveTrainingData(self, vec_sentences, vec_labels):
        """Store training inputs (``x_data``) and targets (``y_data``)."""
        target = CURRENT_MAIN_PATH + "/npz_data/train.npz"
        np.savez(target, x_data=vec_sentences, y_data=vec_labels)

    def SaveTestingData(self, vec_sentences):
        """Store test inputs (``x_data``)."""
        target = CURRENT_MAIN_PATH + "/npz_data/test.npz"
        np.savez(target, x_data=vec_sentences)

    def PreprocessTSV(self, mode='train'):
        """Run the whole pipeline for ``mode`` ('train' or 'test').

        'train' builds both dictionaries from ``raw_data/train.tsv`` (first
        column: sentence, second column: label), then saves the dictionaries
        and the vectorised data. 'test' vectorises ``raw_data/test.tsv`` with
        the dictionaries loaded from disk. Any other mode does nothing.
        """
        if mode == 'test':
            filepath = CURRENT_MAIN_PATH + '/raw_data/test.tsv'
            raw_data = pandas.read_csv(filepath, sep='\t', engine='c')
            raw_sentences = raw_data.iloc[:, 0]
            splited_sentences = [
                Util.GetPreProcessedSentence(sentence)
                for sentence in tqdm(raw_sentences)
            ]
            # Ids must come from the vocabulary produced by the training run.
            self.LoadDicts()
            self.SaveTestingData(self.SentencesToVectors(splited_sentences))
            return

        if mode != 'train':
            return

        filepath = CURRENT_MAIN_PATH + '/raw_data/train.tsv'
        raw_data = pandas.read_csv(filepath, sep='\t', engine='c')
        raw_sentences = raw_data.iloc[:, 0]
        raw_triples = raw_data.iloc[:, 1]

        # Tokenise sentences and normalise labels (original note: "Split by jieba").
        print("Spliting sentences and labels")
        splited_sentences = [
            Util.GetPreProcessedSentence(sentence)
            for sentence in tqdm(raw_sentences)
        ]
        splited_labels = [
            Util.GetPreProcessedLabels(triple)
            for triple in tqdm(raw_triples)
        ]

        # Grow both vocabularies; every label is a one-token document.
        self.text_dict.add_documents(splited_sentences)
        self.label_dict.add_documents([[label] for label in splited_labels])

        vec_sentences = self.SentencesToVectors(splited_sentences)
        vec_labels = self.LabelsToVectors(splited_labels)

        self.SaveDicts()
        self.SaveTrainingData(vec_sentences, vec_labels)
# Command line: argv[2] output prefix, argv[3] epochs, argv[4] batch size,
# argv[5] hidden-layer width.
dest_file_prefix = sys.argv[2]
epoch = int(sys.argv[3])
batch = int(sys.argv[4])
n_hidden = int(sys.argv[5])

# Markers wrapped around every answer.
BOS = '\t'
EOS = '\n'

df = pd.read_csv(data_file, names=('question', 'answer'), dtype='object')
# +2 leaves room for the BOS and EOS markers.
a_maxlen = df['answer'].map(len).max() + 2


def _with_markers(a):
    """Return the answer wrapped in BOS/EOS."""
    return f'{BOS}{a}{EOS}'


ans = df['answer'].map(_with_markers)

# Character-level vocabulary over every question and answer plus the markers.
dic = Dictionary([list(BOS + EOS + ' '.join(df.values.flatten()))])
dic.save(f'{data_file}.dic')


def padding_one_hot(d, size):
    """One-hot encode the characters of ``d`` and zero-pad to ``size`` rows."""
    encoded = np.eye(len(dic))[dic.doc2idx(list(d))]
    filler = np.zeros((size - len(d), len(dic)))
    return np.vstack((encoded, filler))


def one_hot(s):
    """One-hot encode the characters of ``s``, one row per character."""
    return np.eye(len(dic))[dic.doc2idx(list(s))]


def sum_one_hot(s):
    """Sum the one-hot rows of ``s`` into a single character-count vector."""
    return np.add.reduce(one_hot(s))


# x1: per-question character counts; x2: padded one-hot answers;
# y: x2 shifted one step to the left with a zero row appended.
x1 = np.array([sum_one_hot(q) for q in df['question']])
x2 = np.array([padding_one_hot(a, a_maxlen) for a in ans])
y = np.array([np.vstack((d[1:], np.zeros((1, len(dic))))) for d in x2])

enc_inputs = Input(shape=(len(dic), ))
enc_outputs = Dense(n_hidden, activation='relu')(enc_inputs)
def main():
    """Build a gensim vocabulary and, optionally, a topic or word2vec model.

    Options are read from ``sys.argv``. The vocabulary is written to
    ``<model_fn>.vocab`` unless a prebuilt one is supplied with ``--vocab``;
    for every model type except ``vocabulary`` the trained model is then saved
    to ``<model_fn>``.

    Exits with status -1 on an unknown model or dataset type and with status 1
    when a required option (``--dump-file`` for wiki, ``--index`` for es) is
    missing.
    """
    parser = ArgumentParser(
        description=(
            "wrapper script for churning datasets of wiki or elasticsearch kind "
            "through gensim to produce topic models please see gensim "
            "documentation for more information"
        )
    )
    parser.add_argument("-ds", "--dataset", default="wiki", help="What kind of dataset to use. (wiki,es,file)")
    parser.add_argument("-d", "--dump-file", help="Wiki: bz2 dump file with wiki in it")
    parser.add_argument("-l", "--limit", help="Wiki: How many documents to extract from wiki")
    parser.add_argument("--model-id", default="model", help="Filename for created model.")
    parser.add_argument("--model-type", default="lsi", help="Model type (lsi, lda, word2vec, hdp, vocabulary).")
    parser.add_argument("--n-topics", default=10, help="Number of topics to model.")
    parser.add_argument("--n-passes", default=1, help="Number of passes for LDA model.")
    parser.add_argument("--w2v-size", default=100, help="size of Word2Vec context.")
    parser.add_argument("--w2v-window", default=5, help="window for Word2Vec.")
    parser.add_argument("-q", "--query", default=None, help="Elasticsearch: Query to use to fetch documents")
    parser.add_argument("--index", help="Elasticsearch: index to read from.")
    parser.add_argument("--doc_type", default="doc", help="Elasticsearch: data type in index.")
    parser.add_argument("--data-dir", help="Directory to save the generated models and vocabularies into.")
    parser.add_argument("--vocab", help="Prebuilt Vocabulary file. Use this to avoid having to generate one.")
    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ["lsi", "lda", "word2vec", "hdp", "vocabulary"]:
        logging.error("Invalid model type %s", model_type)
        parser.print_usage()
        # sys.exit, not the interactive-only builtin exit() that site installs.
        sys.exit(-1)
    logging.info("Using model type %s", model_type)

    dump_fn = opts.dump_file
    limit = int(opts.limit) if opts.limit else None

    data_type = opts.dataset.lower()
    if data_type not in ["es", "wiki", "file"]:
        logging.error("Invalid dataset type %s", data_type)
        parser.print_usage()
        sys.exit(-1)

    if not dump_fn and data_type in ["wiki"]:
        logging.error("--dump-file required for wiki dataset")
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == "es" and index is None:
        logging.error(
            "Please be kind to at least specify the index you want to fetch from elasticsearch using the --index parameter"
        )
        sys.exit(1)

    n_topics = int(opts.n_topics)
    n_passes = int(opts.n_passes)
    logging.info("Using %d topics.", n_topics)

    # Output name: [<data_dir>/]<id>_<type>_<topics>[_w_<window>_s_<size>]
    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = "%s_%s_%d" % (model_id, model_type, n_topics)
    if data_dir:
        model_fn = "%s/%s" % (data_dir, model_fn)
    if model_type == "word2vec":
        w2v_size = int(opts.w2v_size)
        w2v_window = int(opts.w2v_window)
        model_fn = "%s_w_%s_s_%s" % (model_fn, w2v_window, w2v_size)
    logging.info("Writing models to %s.", model_fn)

    if data_type == "es":
        logging.info("Using data type %s with index %s, doc_type %s query %s",
                     data_type, index, doc_type, query)
        dataset = ElasticsearchDataset(
            read_index=index, read_doc_type=doc_type, query=query, normalize_func=normalize_es
        )
    elif data_type == "wiki":
        logging.info("Using data type %s with dump_file %s and limit %s",
                     data_type, dump_fn, limit)
        dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_wiki)
    elif data_type == "file":
        logging.info("Using data type %s with dump_file %s and limit %s",
                     data_type, dump_fn, limit)
        dataset = FileDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_file)

    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words("norwegian"))
    if not vocab_file or model_type == "vocabulary":
        # Build, prune and persist a fresh vocabulary from the dataset.
        vocab.add_documents([get_tokenized(page, sw) for page in dataset])
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + ".vocab")
    else:
        vocab = Dictionary.load(vocab_file)

    if model_type == "vocabulary":
        # Only the vocabulary was requested; nothing left to train.
        return

    tfidf = TfidfModel(dictionary=vocab)
    if model_type == "lsi":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus], num_topics=n_topics, id2word=vocab)
    elif model_type == "lda":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus], num_topics=n_topics, passes=n_passes, id2word=vocab)
    elif model_type == "word2vec":
        # word2vec consumes token lists, not bag-of-words vectors.
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == "hdp":
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)

    logging.info(model)
    model.save(model_fn)
def main(param_file=None):
    """Train an LSI model on a prepared corpus and score it on the Lee data.

    Steps: load the dictionary and Matrix Market corpus named in the
    parameters returned by ``tools.setup(param_file)``; fit the configured
    pre-model (``'tfidf'`` or ``'log_ent'``) and an LSI model on top of it,
    saving both to the output directory; transform the Lee corpus, compute all
    pairwise cosine similarities, and log their correlation with the human
    similarity matrix.

    Raises:
        ValueError: if ``p['pre_model']`` is neither ``'tfidf'`` nor
            ``'log_ent'``.
    """
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    working_corpus = path.join(base_path, p['corpus_path'], p['corpus_name'])
    human_data_file = path.join(base_path, p['human_data_file'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(base_path, p['corpus_path'], p['dict_name']))
    # keep a copy of the dictionary next to the models produced by this run
    dictionary.save(path.join(output_dir, p['dict_name']))
    logger.info(dictionary)

    logger.info('loading corpus')
    corpus_bow = MmCorpus(working_corpus)

    logger.info("create preprocessing model and save it to disk")
    if p['pre_model'] == 'tfidf':
        pre_model = TfidfModel(corpus_bow, id2word=dictionary, normalize=True)
    elif p['pre_model'] == 'log_ent':
        pre_model = LogEntropyModel(corpus_bow, id2word=dictionary, normalize=True)
    else:
        raise ValueError('model parameter %s not known' % p['pre_model'])
    pre_model.save(os.path.join(output_dir, p['pre_model_extension']))

    logger.info('initialize LSI model')
    lsi = models.LsiModel(pre_model[corpus_bow], id2word=dictionary, num_topics=p['num_topics'])
    lsi_path = os.path.join(output_dir, p['lsi_extension'])
    lsi.save(lsi_path)
    logger.info('finished --> lsi model saved to: %s' % lsi_path)

    # check for correlation with lee human data
    logger.info('load smal lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text, allow_update=False, return_missing=False)
                     for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (LSI)')
    # Materialise once: the transformed corpus is lazy, so iterating it inside
    # the nested loop below would redo the whole transformation on every pass.
    corpus_lsi = list(lsi[pre_model[bow_lee_texts]])

    # compute pairwise similarity matrix of transformed corpus
    n_docs = len(corpus_lsi)
    sim_matrix = np.zeros((n_docs, n_docs))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            sim_matrix[i, j] = matutils.cossim(par1, par2)
    # strict upper triangle: each unordered pair once, diagonal excluded
    sim_vector = sim_matrix[np.triu_indices(n_docs, 1)]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(human_data_file)
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    # compute correlations
    cor = np.corrcoef(sim_vector, human_sim_vector)
    logger.info("correlation with lee human data: %f" % cor[0, 1])

    # Elapsed time is end minus start; the reverse order gives a negative
    # timedelta, which is reported as "-1 days" plus a misleading second count.
    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
def train_dictionary(corpus, fname):
    """Build a gensim ``Dictionary`` from ``corpus`` and save it to ``fname``.

    ``make_folder_tree(fname)`` is called first (presumably to create the
    target directories; confirm against its definition). The corpus is passed
    through the ``by_token`` and then the ``no_syns`` transformations before
    the dictionary is built; ``prune_at=None`` disables gensim's vocabulary
    size cap.
    """
    make_folder_tree(fname)

    # Corpus will be examined by unique symbols
    tokenized = by_token[corpus]
    symbols = no_syns[tokenized]
    vocabulary = Dictionary(symbols, prune_at=None)
    vocabulary.save(fname)
class LDATagger:
    """Build, persist, update and query a gensim LDA model and its dictionary.

    Both artefacts live in one directory (``lda.model`` and ``tokens.dict``)
    and are loaded lazily the first time they are needed. Saving is guarded by
    an in-process lock and by a ``FileLock`` on the model directory.
    """

    _lda_model = None
    _dictionary = None
    _lda_model_path = None
    _dictionary_path = None
    DEFAULT_MODEL_PATH = os.path.join(os.path.dirname(__file__), "model")
    DEFAULT_NUM_TOPICS = 1000

    # NOTE(review): the default lock is created once, when the class body is
    # executed, so every instance built without an explicit ``lock`` shares
    # it. That looks intentional (serialise saves across instances); confirm.
    def __init__(self, model_path=DEFAULT_MODEL_PATH,
                 num_topics=DEFAULT_NUM_TOPICS, lock=threading.Lock()):
        """Prepare paths and locks; create ``model_path`` when it is missing.

        Raises:
            Exception: if ``model_path`` points at an existing regular file.
        """
        self.save_model_lock = lock
        if os.path.isfile(model_path):
            raise Exception("Invalid Model Path; Should Be a Directory")
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        self._lda_model_path = os.path.join(model_path, "lda.model")
        self._dictionary_path = os.path.join(model_path, "tokens.dict")
        self.num_topics = num_topics
        self.model_folder_lock = FileLock(model_path)

    def topics_for_documents(self, doc_tokens_map):
        """Return ``{document_id: topics}`` for a ``{document_id: tokens}`` map."""
        self.check_and_load_model()
        doc_topics_map = defaultdict(list)
        for document_id, document_tokens in doc_tokens_map.iteritems():
            doc_topics_map[document_id] = self.topics_for_document(
                document_tokens)
        return doc_topics_map

    def topics_for_document(self, tokens):
        """Return the LDA topic distribution of one tokenised document."""
        self.check_and_load_model()
        bow_tokens = self._dictionary.doc2bow(tokens)
        return self._lda_model[bow_tokens]

    def build_topics(self, tokens_list):
        """Train a new dictionary and LDA model from scratch, then save both."""
        self._dictionary = Dictionary(tokens_list)
        corpus = [
            self._dictionary.doc2bow(document_tokens)
            for document_tokens in tokens_list
        ]
        self._lda_model = LdaModel(corpus=corpus,
                                   id2word=self._dictionary,
                                   num_topics=self.num_topics,
                                   passes=100)
        self.save_model()

    def save_model(self, sleep_for_test=False, mock_datastruct=None):
        """Write the model and dictionary to disk while holding both locks.

        ``sleep_for_test`` and ``mock_datastruct`` are test hooks: the first
        keeps the locks held for one second, the second is an extra lock-like
        object acquired and released together with the real ones.

        Every lock taken here is released again even when a later step raises.
        """
        self.save_model_lock.acquire()
        try:
            self.model_folder_lock.acquire()
            try:
                if mock_datastruct:
                    mock_datastruct.acquire()
                try:
                    if sleep_for_test:
                        import time
                        time.sleep(1)
                    print("Acquired Lock ")
                    self._lda_model.save(self._lda_model_path)
                    self._dictionary.save(self._dictionary_path)
                finally:
                    print("Released Lock")
                    if mock_datastruct:
                        mock_datastruct.release()
            finally:
                self.model_folder_lock.release()
        finally:
            self.save_model_lock.release()

    def check_and_load_model(self):
        """Load model and dictionary from disk unless both are in memory.

        Raises:
            Exception: if either file is missing.
        """
        if self._lda_model and self._dictionary:
            return
        if os.path.exists(self._lda_model_path):
            self._lda_model = LdaModel.load(self._lda_model_path)
        else:
            raise Exception("LDA Model Not found in the path")
        if os.path.exists(self._dictionary_path):
            self._dictionary = Dictionary.load(self._dictionary_path)
        else:
            raise Exception("Tokens Dictionary Not found in the path")

    def update_model(self, tokens_list):
        """Update the existing LDA model with more documents and save it."""
        self.check_and_load_model()
        corpus = [
            self._dictionary.doc2bow(document_tokens)
            for document_tokens in tokens_list
        ]
        self._lda_model.update(corpus=corpus)
        self.save_model()

    def build_or_update_model(self, tokens_list):
        """Train from scratch when nothing is stored yet, otherwise update."""
        if not self.does_model_exist():
            self.build_topics(tokens_list)
        else:
            self.update_model(tokens_list)

    def does_model_exist(self):
        """Return True when both the model file and the dictionary file exist."""
        return (os.path.exists(self._lda_model_path)
                and os.path.exists(self._dictionary_path))

    def get_model(self):
        """Return the pickled model and dictionary in a dict."""
        self.check_and_load_model()
        model_hash = {
            "lda_model": cPickle.dumps(self._lda_model),
            "dictionary": cPickle.dumps(self._dictionary)
        }
        return model_hash

    def restore_model(self, model_hash):
        """Replace the model and dictionary with pickled copies and save them.

        NOTE(review): unpickling executes arbitrary code; only pass a
        ``model_hash`` that comes from a trusted source.
        """
        self._lda_model = cPickle.loads(
            model_hash["lda_model"].encode('utf-8'))
        self._dictionary = cPickle.loads(
            model_hash["dictionary"].encode('utf-8'))
        self.save_model()

    def topics_to_tokens(self):
        """Return ``{topic_id: {token: probability}}`` for the stored model.

        Returns an empty list when no model has been saved yet.
        """
        if not self.does_model_exist():
            return []
        # The files existing does not mean they are loaded: without this call
        # a fresh instance would call show_topics on None.
        self.check_and_load_model()
        topics_tokens_map = defaultdict(list)
        topics_to_tokens = self._lda_model.show_topics(
            topics=self.DEFAULT_NUM_TOPICS,
            topn=25,
            log=False,
            formatted=False)
        for topic_id, tokens in enumerate(topics_to_tokens):
            topics_tokens_map[topic_id] = self.list_of_tuples_to_hash(tokens)
        return topics_tokens_map

    def list_of_tuples_to_hash(self, tokens):
        """Turn ``(probability, token)`` pairs into ``{token: probability}``."""
        tokens_hash = defaultdict(float)
        for token_probability, token in tokens:
            tokens_hash[token] = token_probability
        return tokens_hash
return [token for token in simple_preprocess(text) if token not in stop_words] def iter_wiki(dump_file): """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple.""" ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split() for title, text, pageid in _extract_pages(smart_open(dump_file)): text = filter_wiki(text) tokens = tokenize(text) if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces): continue # ignore short articles and various meta-articles yield title, tokens wiki_stream = (tokens for _, tokens in iter_wiki('enwiki-latest-pages-articles.xml.bz2')) print "making of dictionary started" wiki_dictionary = Dictionary(wiki_stream) print "wikipedia dictionary made" wiki_dictionary.filter_extremes(no_below=10, no_above=0.3, keep_n=200000) print "...... saving the dictionary" wiki_dictionary.save('WikiDictionary200k.dict') print "dictionary saved ........" # wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2') # make a corpus from wiki dump # MmCorpus.save_corpus('WikiCorpus.mm', wiki) # Saving the corpus
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

# Segment the first 1100 articles with jieba; each article becomes one string
# of space-separated tokens.
data = []
for filename in glob(
        r'C:\Users\75043\PycharmProjects\NLP\TF-IDF\corups\THUCNews\体育\*.txt'
)[:1100]:
    with open(filename, encoding='utf-8') as f:
        # print(filename)
        text = f.read().replace('\n', '')
    text = ' '.join(jieba.cut(text))
    data.append(text)

# First 1000 documents form the training split, the remainder the test split.
for path, subset in (('finance_news_train.json', data[:1000]),
                     ('finance_news_test.json', data[1000:])):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(subset, f, indent=2, ensure_ascii=False)

# Reload the training split and turn every document back into a token list;
# Dictionary expects an iterable of iterables of str.
with open('finance_news_train.json', encoding='utf-8') as f:
    data = json.load(f)
data = [doc.split() for doc in data]

dct = Dictionary(data)
# bag-of-words vectors, one per document
corpus = [dct.doc2bow(doc) for doc in data]
# print(corpus[0])

# fit the TF-IDF weights, then persist vocabulary and model
model = TfidfModel(corpus)
dct.save('news.dict')
# print(dct[0],dct[1],len(dct),dct)
model.save('news_tfidf.model')
class DictionaryPipeline(Pipeline):
    """Pipeline stage that maintains a gensim dictionary and converts
    tokenised documents into their bag of words representation.
    """

    def __init__(self, *args, **kwargs):
        """Sets up the stage without loading a dictionary yet.
        """
        super().__init__(*args, **kwargs)
        # Filled lazily by get_dictionary(); read it through that coroutine
        # unless you know the dictionary has already been loaded.
        self._dictionary = None

    @staticmethod
    def load_dictionary():
        """Loads the saved dictionary from the training file location.

        Returns:
            :obj:`gensim.corpora.dictionary.Dictionary`: The dictionary stored
            as "dictionary.gensim", or None if that file does not exist.
        """
        file_path = get_training_file_path("dictionary.gensim")
        if not os.path.isfile(file_path):
            return None
        return Dictionary.load(file_path)

    @staticmethod
    def get_input_stream(schema=None):
        """Builds the chain of pipelines whose output feeds the dictionary.

        Args:
            schema(:obj:`dict`): The schema for the file pipeline

        Returns:
            The output stream of the last stage of the chain: file reading,
            then sentence, word and lemma pipelines, in that order.
        """
        stream = ReadFilePipeline(
            input_stream=ReadFilePipeline.get_input_stream(),
            schema=schema).output_stream()
        for stage in (SentencePipeline, WordPipeline):
            stream = stage(input_stream=stream).output_stream()
        return LemmaPipeline(input_stream=stream).output_stream()

    async def train_dictionary(self):
        """Runs every item of the input stream through this pipeline, printing
        progress as it goes, and saves the dictionary afterwards.
        """
        input_stream = self.get_input_stream()
        processed = 0
        total = len(get_file_list())
        async for data in input_stream:
            await self.run(data)
            processed += 1
            print_progress(processed, total)
        print("")
        self.save_dict()

    async def get_dictionary(self):
        """Returns the dictionary, loading or training it on first use.

        The dictionary already held by this instance is returned if there is
        one. Otherwise it is loaded from file; if no file exists either, an
        empty dictionary is created and trained from the corpus.

        Returns:
            :obj:`gensim.corpora.dictionary.Dictionary`: The dictionary in use
            by this pipeline.
        """
        if self._dictionary is not None:
            return self._dictionary
        self._dictionary = self.load_dictionary()
        if self._dictionary is not None:
            return self._dictionary
        print("Did not find a saved dictionary. Training one now.")
        # Assigned before training so that coroutine(), which training calls
        # through run(), finds a dictionary to update.
        self._dictionary = Dictionary()
        await self.train_dictionary()
        return self._dictionary

    def save_dict(self):
        """Writes the current dictionary to "dictionary.gensim".
        """
        self._dictionary.save(get_training_file_path("dictionary.gensim"))

    async def coroutine(self, data):
        """Replaces the tokenised documents in the data with bags of words.

        Words not yet known are added to the dictionary, and the dictionary is
        saved after every call.

        Args:
            data (:obj:`dict`): A dict with the key "text" containing a list
                of lists with tokenised words.

        Returns:
            :obj:`dict`: The same dict, with the value of "text" replaced by a
            list holding one bag of words per document.
        """
        dictionary = await self.get_dictionary()
        bags = []
        for document in data["text"]:
            bags.append(dictionary.doc2bow(document, allow_update=True))
        data["text"] = bags
        self.save_dict()
        return data
# Tokenise every line of output.txt; each line is one document.
text_data = []
with codecs.open("output.txt", "r", "utf-8") as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        print(tokens, '\n')
        text_data.append(tokens)

from gensim.corpora import Dictionary
dictionary = Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

import pickle
# Context managers make sure the pickle files are flushed and closed; the
# bare open(...) calls used before left that to the garbage collector.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

import gensim
NUM_TOPICS = 4
ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                           num_topics=NUM_TOPICS,
                                           id2word=dictionary,
                                           passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

# Reload everything from disk, as a later visualisation step would.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
import pyLDAvis.gensim
# LDA trigram_dictionary_filepath = os.path.join('.','trigram_dict_all_diags.dict') if rerun: trigram_reviews = LineSentence(trigram_records_filepath) # learn the dictionary by iterating over all of the reviews trigram_dictionary = Dictionary(trigram_reviews) # filter tokens that are very rare or too common from # the dictionary (filter_extremes) and reassign integer ids (compactify) trigram_dictionary.filter_extremes(no_below=10, no_above=0.4) trigram_dictionary.compactify() trigram_dictionary.save(trigram_dictionary_filepath) # load the finished dictionary from disk trigram_dictionary = Dictionary.load(trigram_dictionary_filepath) trigram_bow_filepath = os.path.join('.', 'trigram_bow_corpus_all_diags.mm') def trigram_bow_generator(filepath): """ generator function to read reviews from a file and yield a bag-of-words representation """ for review in LineSentence(filepath): yield trigram_dictionary.doc2bow(review)
# Summary of the run about to start.
# NOTE(review): the banner names the dataset with data_scope_name while every
# file below is written under research_scope; confirm the two always agree.
print((
    'LDA Model based on {3} dataset.\n\tSample Size: {0},\n\tTop {1} Words,\n\tNo of Topics {2}'
    .format(sample_size, len(dictionary.values()), num_topics,
            data_scope_name)))

# Train the LDA model with four worker processes.
LDAmodel_scope = LdaMulticore(
    corpus=corpus,  #mm,
    id2word=dictionary,
    num_topics=num_topics,
    workers=4,
    chunksize=5000,
    passes=50,
    alpha='asymmetric',
    random_state=random_state)

# Persist the dictionary and the model under data/model/.
dictionary.save(
    'data/model/{0}_dictionary.pkl'.format(research_scope))  #data_scope_name))
LDAmodel_scope.save(
    'data/model/{0}'.format(research_scope))  #data_scope_name))

# pickle the model here and insert in SQL
# The model is read back from the file that was just written.
LDAmodel_scope = LdaMulticore.load(
    'data/model/{0}'.format(research_scope))  #data_scope_name))

# Feature vector
# presumably adds the 'lda_features' column read below; verify in df_lda_features
df_lda_features(LDAmodel_scope, scope_lda_sample)

# Topic distribution
# Mean of the 'lda_features' column over the sample.
RequestTopicDistribution = scope_lda_sample['lda_features'].mean()

# Figure and title for the topic-distribution plot.
fig, ax1 = plt.subplots(1, 1, figsize=(20, 6))
nr_top_bars = 5
title_dist = '{}_Request Topic distributions showing top {} bars of {} topics'.format(
    research_scope, nr_top_bars, num_topics)
r.append(word[0]) rl = (" ".join(r)).strip() return rl with open(data_path, 'w') as f: data = csv.reader(f, delimiter=',') for row in data: results.append(clean_posts(row)) with open(parsed_path, 'w', encoding='utf-8') as fp: fp.write("\n".join(results)) # making of the lda phrases analysis # making of the dictionary for lda topic analysis dict_made = False dict_path = 'dictionary.dict' if dict_made: dictionary = Dictionary.load(dict_path) else: reviews_for_lda = word2vec.LineSentence(reviews_for_lda_filepath) dictionary = Dictionary(reviews_for_lda) dictionary.filter_extremes(no_below=10, no_above=0.4) dictionary.compactify() dictionary.save(dict_path)
tokens = list(filter(None, tokens)) return tokens class Corpus(object): def __iter__(self): for file in glob.glob("*.txt"): print(file) paper = Path(file).read_text(encoding='utf8') yield paper corpus_memory_friendly = Corpus() papers = list(corpus_memory_friendly) texts = [list(preprocess(t)) for t in papers] # define the dictionary: dictionary = Dictionary(texts) dictionary.save('reasoning_corpura.dict') corpus = [dictionary.doc2bow(text) for text in texts] MmCorpus.serialize('reasoning_bow.mm', corpus) hash_dictionary = HashDictionary(texts ) hash_dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) hash_dictionary.save_as_text('reasoning_wordids.txt.bz2') hash_dictionary.save('reasoning_corpura_hash.dict')
import json
import numpy as np
import boto3

# S3 handle for the workspace bucket (not used in the part of the script
# visible here).
s3 = boto3.resource("s3")
myBucket = s3.Bucket('workspace.scitodate.com')
homedir = os.environ['HOME']

from gensim.corpora import Dictionary
from gensim.models import *

# c2n: JSON mapping whose keys are split on '#' below.
f = open(homedir + "/results/ontology/c2n.json", 'r')
c2n = json.load(f)
f.close()
prefix = 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#'

# Keep the part after '#' of every key and build a one-document dictionary
# from these codes.
ncit_dict = [k.split('#')[1] for k in c2n.keys()]
dictionary = Dictionary([ncit_dict])
# NOTE(review): the result of this lookup is discarded. Presumably it is here
# to make gensim fill the lazily built `id2token` mapping that is handed to
# AuthorTopicModel below; confirm before removing it.
dictionary[0]
dictionary.save(homedir + "/results/models/lda_dict")

# Previously trained TF-IDF model.
tfidf = TfidfModel.load(homedir + "/results/models/tfidf_model")

# One author-topic model per topic count.
for i in [2, 3, 4]:
    lda = AuthorTopicModel(id2word=dictionary.id2token,
                           num_topics=i,
                           eval_every=False)

    # Documents: bag-of-words first, then TF-IDF weighted.
    f = open(homedir + "/thesiswork/source/corpus/lda_doc_2k.json", 'r')
    _corpus = json.load(f)
    f.close()
    bow_corpus = [dictionary.doc2bow(doc) for doc in _corpus]
    tfidf_corpus = [tfidf[doc] for doc in bow_corpus]

    # Raw mapping loaded from lda_d2a_2k.json (document-to-author, judging by
    # the name); d2a is filled further down, outside this view.
    f = open(homedir + "/thesiswork/source/corpus/lda_d2a_2k.json", 'r')
    _d2a = json.load(f)
    f.close()
    d2a = {}
def listify_commenters(commenters_str):
    """Transform string of commenters into list.

    The string is split on ``", "``; an empty string yields ``[""]``.
    """
    return commenters_str.split(", ")


def get_commenters_set():
    """Get list of all unique, non-empty commenters.

    Reads the module-level ``commenters_df``. The order of the returned list
    is unspecified.
    """
    unique_commenters = {
        name
        for story_commenters in commenters_df["commenters"].values
        for name in story_commenters
    }
    # discard() instead of list.remove(): stories without commenters produce
    # an empty name, but when none exists remove() raised ValueError.
    unique_commenters.discard("")
    return list(unique_commenters)


def get_boc(tokens):
    """Convert list of commenters for each story to "bag of commenters" (boc)"""
    return dct.doc2bow(tokens)


commenters_df = fetch_commentors()

# Vocabulary of commenter names, pruned with gensim's default thresholds.
dct = Dictionary(commenters_df["commenters"].values)
dct.filter_extremes()
dct.save('commenters_dct.dict')

commenters_df["boc"] = commenters_df["commenters"].apply(get_boc)
commenters_ary = commenters_df["boc"].values

# Dimensionality reduction of "bag of commenters" with LSI
commenters_dimrec_model = LsiModel(corpus=commenters_ary,
                                   num_topics=300,
                                   id2word=dct)
# NOTE(review): `commenters_dimrec_ary` is not defined anywhere in the visible
# code, so this line raises NameError as written. It presumably should be the
# LSI projection `commenters_dimrec_model[commenters_ary]`; confirm and define
# it before serialising.
MmCorpus.serialize('commenters_corpus.mm', commenters_dimrec_ary)
class Initialize():
    """Preprocess the question/answer data and write every derived resource.

    Constructing an instance runs the whole pipeline: tokenised questions and
    answers, seed questions, the labelled train/test split, the training
    corpus, the gensim dictionary, word2vec vectors, the translation
    background probabilities and the TF-IDF model. Each step writes its result
    to the path held in the matching module-level constant.
    """

    def __init__(self, w2v_dim=300, w2v_window=10, alpha=0.7, sigma=0.3):
        """Run all preprocessing steps in dependency order.

        Args:
            w2v_dim: dimensionality passed to the word2vec training.
            w2v_window: context window passed to the word2vec training.
            alpha: stored unchanged in the translation resource.
            sigma: stored unchanged in the translation resource.
        """
        self.w2v_dim = w2v_dim
        self.w2v_window = w2v_window
        self.nlp = spacy.load('nl', disable=['tagger', 'parser', 'ner'])
        with open(CATEGORY2PARENT_PATH) as f:
            self.category2parent = json.load(f)
        with open(STOPWORDS_PATH) as f:
            self.stopwords = [word.lower().strip() for word in f.read().split()]

        print('Parsing questions...')
        self.init_questions()
        print('Parsing answers...')
        self.init_answers()
        print('Filtering seeds...')
        self.init_seeds()
        print('Parsing labeled data...')
        self.init_labeled_data()
        print('Parsing corpus...')
        self.init_corpus()
        print('Parsing dictionary...')
        self.init_dictionary()

        # word2vec
        print('Initializing Word2Vec...')
        self.init_word2vec()
        # translation
        print('Initializing Translation...')
        self.init_translation(alpha, sigma)
        # softcosine
        print('Initializing Softcosine...')
        self.init_sofcos()

    def _tokenize(self, text):
        """Tokenise with spaCy, lowercase, and drop stopwords and punctuation."""
        lowered = (str(token).lower() for token in self.nlp(text))
        return [w for w in lowered
                if w not in self.stopwords and w not in punctuation]

    def init_questions(self):
        """Tokenise every question and write the result to NEW_QUESTIONS."""
        # QUESTIONS
        # (The result is always recomputed; a cached NEW_QUESTIONS file is not
        # reused.)
        with open(QUESTIONS) as f:
            questions = json.load(f)
        self.questions = {}
        for question in questions:
            self.questions[question['id']] = {
                'id': question['id'],
                'tokens_proc': self._tokenize(question['questiontext']),
                'starcount': question['starcount'],
                'answercount': question['answercount'],
                'text': question['questiontext'],
                'cid': question['cid']
            }
        with open(NEW_QUESTIONS, 'w') as f:
            json.dump(self.questions, f)

    def init_answers(self):
        """Tokenise every answer and write the result to NEW_ANSWERS."""
        with open(ANSWERS) as f:
            answers = json.load(f)
        self.answers = {}
        for answer in answers:
            self.answers[answer['id']] = {
                'tokens_proc': self._tokenize(answer['answertext'])
            }
        with open(NEW_ANSWERS, 'w') as f:
            json.dump(self.answers, f)

    def init_seeds(self):
        """Select questions with at least one answer and one star as seeds.

        A seed's category is the parent of its ``cid`` when
        ``category2parent`` has one, otherwise the ``cid`` itself.
        """
        self.seeds = []
        for question in self.questions.values():
            if int(question['answercount']) < 1:
                continue
            if int(question['starcount']) < 1:
                continue
            category = self.category2parent.get(question['cid'], question['cid'])
            self.seeds.append({
                'id': question['id'],
                'tokens': question['tokens_proc'],
                'text': question['text'],
                'category': category
            })
        with open(SEEDS_PATH, 'w') as f:
            json.dump(self.seeds, f)

    def init_labeled_data(self):
        """Build labelled question pairs and a random 80/20 split by query.

        Every row of TRAINDATA is a query question with its BM25 candidates; a
        pair is labelled 1 when its 'Lax' field equals 'Similar', else 0.
        """
        with open(TRAINDATA) as f:
            procdata = json.load(f)
        self.procdata = {}
        for row in procdata:
            q1id = row['id']
            q1_tokens_proc = self.questions[q1id]['tokens_proc']
            self.procdata[q1id] = {}
            for row2 in row['bm25']:
                q2id = row2['id']
                self.procdata[q1id][q2id] = {
                    'q1': q1_tokens_proc,
                    'q2': self.questions[q2id]['tokens_proc'],
                    'score': float(row2['BM25-score']),
                    'label': 1 if row2['Lax'] == 'Similar' else 0
                }

        qids = list(self.procdata.keys())
        shuffle(qids)
        trainsize = int(0.8 * len(qids))
        self.traindata = {qid: self.procdata[qid] for qid in qids[:trainsize]}
        self.testdata = {qid: self.procdata[qid] for qid in qids[trainsize:]}

        with open(NEW_TRAINDATA, 'w') as f:
            json.dump({'procdata': self.procdata,
                       'train': self.traindata,
                       'test': self.testdata}, f)

    def init_corpus(self):
        """Collect the token lists of all answers and non-test questions."""
        self.corpus = [
            self.questions[qid]['tokens_proc']
            for qid in self.questions
            # test queries are kept out of everything trained on the corpus
            if qid not in self.testdata
        ]
        self.corpus.extend(answer['tokens_proc'] for answer in self.answers.values())
        with open(CORPUS_PATH, 'w') as f:
            json.dump({'corpus': self.corpus}, f)

    def init_dictionary(self):
        """Fit the gensim dictionary on the corpus and save it to DICT_PATH."""
        self.dict = Dictionary(self.corpus)  # fit dictionary
        self.dict.save(DICT_PATH)

    # WORD2VEC
    def init_word2vec(self):
        """Train word2vec on the corpus; output goes to DATA_PATH."""
        w2v.run(documents=self.corpus, write_path=DATA_PATH,
                w_dim=self.w2v_dim, window=self.w2v_window)

    # Softcosine
    def init_sofcos(self):
        """Fit a TF-IDF model on the bag-of-words corpus and save it."""
        corpus = [self.dict.doc2bow(line) for line in self.corpus]  # convert corpus to BoW format
        self.tfidf = TfidfModel(corpus)  # fit model
        self.tfidf.save(TFIDF_PATH)

    # Translation
    def init_translation(self, alpha, sigma):
        """Compute smoothed background word probabilities and write them out.

        Each word gets ``(count + 1) / (total tokens + vocabulary size)``. The
        probabilities are grouped by the word's first character and stored in
        TRANSLATION_PATH together with ``alpha`` and ``sigma``.
        """
        tokens = [token for question in self.corpus for token in question]
        Q_len = float(len(tokens))
        denominator = Q_len + len(self.dict)
        aux_w_Q = {
            self.dict[token_id]: (count + 1.0) / denominator
            for token_id, count in self.dict.doc2bow(tokens)
        }

        w_Q = {}
        for w, probability in aux_w_Q.items():
            w_Q.setdefault(w[0], {})[w] = probability

        translation = {'w_Q': w_Q, 'alpha': alpha, 'sigma': sigma}
        with open(TRANSLATION_PATH, 'w') as f:
            json.dump(translation, f)
# Collect the dictionary tokens that carry the 'ne_' prefix
# (presumably named entities; the prefix is set elsewhere).
dict_token2id = dictionary.token2id
tokens = list(dict_token2id.keys())
ne_tokens = [token for token in tokens if token.startswith('ne_')]

# find the ids of the ne
ne_token_ids = [dict_token2id[token] for token in ne_tokens]
ne_token_ids = set(ne_token_ids)

# ne term weighting
# add max token frequency tuple in documents
# A sentinel (-1, max frequency) is appended to every document so that the
# next step can read the maximum from news[-1]. max() raises ValueError on a
# document without tokens.
bow_news = [news + [(-1, max([t[1] for t in news]))] for news in bow_news]

# add max token frequency to ne
# Counts of 'ne_' tokens are raised by the document maximum; every other
# entry, the sentinel included, keeps its count.
bow_news = [[(t[0], t[1]+news[-1][1]) if t[0] in ne_token_ids else (t[0], t[1]) for t in news] for news in bow_news]

# remove last tuple
# (the sentinel added above)
bow_news = [news[:-1] for news in bow_news]

dictionary.save(os.path.join(data_dir, 'ne_nedf_weighting.dict'))
save_model(bow_news, os.path.join(data_dir, 'ne_nedf_weighting.bow'))

# NOTE(review): timedelta.seconds leaves out whole days; total_seconds() would
# be needed for runs longer than 24 hours.
endtime = datetime.datetime.now()
print('Totol running for ', (endtime - starttime).seconds, ' seconds.')

# English rendering of the string below: "Load the required Dictionary and
# bag-of-words files and train LDA with gensim's LDA library; run 5
# experiments for each topic-count setting."
""" 加载需要的Dictionary和bag-of-words文件,调用Gensim中的LDA库训练LDA,每种主题数设置做5词实验 """
dataset = ['20news']

# Set training parameters.
# num_topics = 100
num_topics_list = [20,50,100]
# num_topics_list = [100]
passes_list = [100]
class LDAModelMaker():
    """
    Build (or reload) a gensim dictionary and bag-of-words corpus, fit an LDA
    model on it, and save the model together with its pyLDAvis page.
    """

    def __init__(self, create, texts_filepath, corpus_filepath,
                 dictionary_filepath, lda_filepath, pyldavis_filepath,
                 database=None, **run_parameters):
        """
        Pull up information required for generating the LDA Model.

        :param {boolean} create: Whether or not we must create texts.
        :param {str} texts_filepath: Where texts are saved to (create) or
            loaded from (load).
        :param {str} corpus_filepath: Location of where we must save or load
            the corpus from.
        :param {str} dictionary_filepath: Location of where we must save or
            load the dictionary from.
        :param {str} lda_filepath: Location of where we must save the lda
            model to.
        :param {str} pyldavis_filepath: Location of where we must save the
            pyldavis to.
        :param {str} database: Which kind of database we will be using; only
            'mongo' is registered. None disables the database step.
        :param run_parameters: Extra keyword arguments forwarded to LdaModel.
        """
        self.next_steps = {'mongo': self.mongo}
        self.texts_filepath = texts_filepath
        self.corpus_filepath = corpus_filepath
        self.dictionary_filepath = dictionary_filepath
        self.lda_filepath = lda_filepath
        self.pyldavis_filepath = pyldavis_filepath
        self.dictionary = Dictionary()
        self.run_parameters = run_parameters
        self.apply = self.create_corpus_dict if create else self.load_corpus_dict
        # Always define the attribute so create_corpus_dict can test it
        # instead of failing with AttributeError when no database was given.
        self.database = self.next_steps[database] if database else None

    def create_corpus_dict(self, texts):
        """
        Save texts, dictionary, and corpus. Generate and save lda & pyldavis
        model.

        :param {list[list[str]]} texts: Tokenized words
        """
        self.texts = texts
        self.save_texts()
        self.set_dict_corp()
        if self.database is not None:
            self.database()
        self.fit_LDA()

    def mongo(self):
        """
        Connect to the collection named by the EMAIL_DATABASE_NAME and
        EMAIL_COLLECTION_NAME environment variables and store the filtered
        texts and corpus values in it.
        """
        import os
        from pymongo import MongoClient
        emailClient = MongoClient()
        self.db = emailClient[os.getenv('EMAIL_DATABASE_NAME')]
        self.col = self.db[os.getenv('EMAIL_COLLECTION_NAME')]
        self.email_database_content()

    def load_corpus_dict(self, useless):
        """
        Reload texts, dictionary and corpus from disk, then fit the LDA model.

        :param useless: Ignored; present so both `apply` targets share a
            one-argument signature.
        """
        # The loaders assign the attributes themselves; do not re-assign
        # from their return value.
        self.load_texts()
        self.load_dictionary()
        self.load_corpus()
        self.fit_LDA()

    def save_texts(self):
        """
        Save self.texts to a file so we don't have to keep re-computing
        """
        with open(self.texts_filepath, 'wb') as save:
            pickle.dump(self.texts, save)

    def load_texts(self):
        """
        Load self.texts from a file it was saved to earlier.

        :return: The loaded texts (also stored on self.texts).
        """
        # NOTE: pickle must only be used on files this tool wrote itself.
        with open(self.texts_filepath, 'rb') as save:
            self.texts = pickle.load(save)
        return self.texts

    def load_dictionary(self):
        """
        Load self.dictionary from a file it was saved to earlier.

        :return: The loaded dictionary (also stored on self.dictionary).
        """
        self.dictionary = Dictionary.load(self.dictionary_filepath)
        return self.dictionary

    def load_corpus(self):
        """
        Load self.corpus from a file it was saved to earlier.

        :return: The loaded corpus (also stored on self.corpus).
        """
        self.corpus = corpora.MmCorpus(self.corpus_filepath)
        return self.corpus

    # TO-DO: For code meant to work with updating the corpus: analyze
    # trade-off. Figure out if passing texts as parameter is better than just
    # re-running with complete self.texts. If not: don't worry about it.
    def set_dict_corp(self):
        """
        Extend self.dictionary with self.texts, rebuild self.corpus from
        self.texts, and save both to their file paths.

        ISSUE: when updating the dictionary, new words may be introduced, so
        documents converted before the update never carry those words. The
        dictionary is fine, the corpus is not.
        """
        self.dictionary.add_documents(self.texts)
        self.dictionary.save(self.dictionary_filepath)
        self.corpus = self.make_corpus()
        corpora.MmCorpus.serialize(self.corpus_filepath, self.corpus)

    # Maybe make an object out of this? Separate class maybe?
    def make_corpus(self):
        """
        Make corpus

        :return: One bag-of-words list per entry of self.texts.
        """
        return [self.dictionary.doc2bow(text) for text in self.texts]

    def fit_LDA(self):
        """
        Fit an LDA model on self.corpus, write the pyLDAvis page to
        self.pyldavis_filepath and save the model to self.lda_filepath.
        Model options such as num_topics come from self.run_parameters.
        """
        self.lda = ldamodel.LdaModel(corpus=self.corpus, alpha='auto',
                                     id2word=self.dictionary,
                                     **self.run_parameters)
        lda_vis_serialized = pyLDAvis.gensim.prepare(self.lda, self.corpus,
                                                     self.dictionary,
                                                     sort_topics=False)
        pyLDAvis.save_html(lda_vis_serialized, self.pyldavis_filepath)
        self.lda.save(self.lda_filepath)

    def get_domain(self, list_filtered_emails, email):
        """
        Find domain of the email.

        :param {list[str]} list_filtered_emails: List of filtered emails.
        :param {line in database} email: One line in the database (unused).
        :return: List of '@domain' strings, or None if the input is None or
            any entry has no domain part.
        """
        if list_filtered_emails is None:
            return None
        try:
            domains = [
                re.search(r'@[\w.]+', e).group()
                for e in list_filtered_emails
            ]
        except (AttributeError, TypeError) as e:
            # AttributeError: no '@domain' in an entry (search returned None);
            # TypeError: an entry is not a string.
            print(e)
            return None
        return domains

    def email_database_content(self):
        """
        Join each list in self.texts to form strings which can be later
        manipulated to be stored in the email database, then store them.
        """
        self.texts = [' '.join(text) for text in self.texts]
        self.set_email_database()

    def set_email_database(self):
        """
        Add the filtered content of each email and document in corpus to the
        email database.

        NOTE: Iteration order over the database does not matter; each value
        is pulled from self.texts / self.corpus at the email's own
        'email_counter' index.
        """
        for email in self.col.find():
            try:
                self.col.update_one(
                    {'_id': email['_id']},
                    {
                        '$set': {
                            'filtered_content': self.texts[email['email_counter']],
                            'email_corpus_value': self.corpus[email['email_counter']]
                        }
                    },
                    # 'sender_domain': self.getDomain(email['sender_email'], email),
                    # 'recipient_domain': self.getDomain(email['recipient_email'], email)}},
                    upsert=False)
            except IndexError:
                # Counter points past the loaded texts/corpus; report and go on.
                print(email['email_counter'])
                print(email['_id'])
class MyCorpus(object):
    """
    Streaming bag-of-words corpus over a set of cuisine review files.
    """

    def __init__(self, file_list, file_dir, dictionary=None, mindf=MINDF,
                 maxdf=MAXDF, maxwords=MAXWORDS, cluster_words=CLUSTER_WORDS,
                 cluster_ul=CLUSTER_UL):
        # list of cuisine text files and the directory that holds them
        self.file_list = file_list
        self.file_dir = file_dir
        # maximum number of words to keep after building dictionary from clusters
        self.maxwords = maxwords
        # maximum number of words to keep from each cluster
        self.cluster_words = cluster_words
        # upper proportion of reviews to limit for cluster processing
        self.cluster_ul = cluster_ul
        # minimum number of documents / max proportion of documents to keep word
        self.mindf = mindf
        self.maxdf = maxdf
        # True: yield one document per file; False: yield single reviews
        self.agglomerate = True
        if not dictionary:
            self.dictionary = Dictionary()
            self._build_dict()
        else:
            self.dictionary = dictionary

    def __str__(self):
        return "<MyCorpus at %s>" % hex(id(self))

    def __repr__(self):
        return self.__str__()

    def _build_dict(self):
        """
        Feed each file's most widespread words into self.dictionary, then
        filter the dictionary and save it to disk.
        """
        for filename in self.file_list:
            doc_freq = {}  # word -> number of reviews containing it
            num_reviews = 0
            with open(os.path.join(self.file_dir, filename), "rt") as f:
                for line in f:
                    num_reviews += 1
                    for word in set(line[REVIEW_INDEX:].split()):
                        doc_freq[word] = doc_freq.get(word, 0) + 1
            # keep words seen in more than 2 reviews but below the upper limit
            kept = [
                (word, count) for word, count in doc_freq.items()
                if count > 2 and count / num_reviews < self.cluster_ul
            ]
            kept.sort(key=lambda pair: -pair[1])
            ranked_words = [word for word, _ in kept]
            self.dictionary.add_documents([ranked_words[:self.cluster_words]])
            print("%s added to corpus dictionary!" % (filename,))
        self.dictionary.filter_extremes(self.mindf, self.maxdf, self.maxwords)
        self.dictionary.save("cuisine_dictionary.gensimDict")

    def __iter__(self):
        """
        Yield bag-of-words documents: one per file when self.agglomerate is
        set, otherwise one per review, skipping review ids already seen.
        """
        if not self.agglomerate:
            seen_ids = set()
            for filename in self.file_list:
                with open(os.path.join(self.file_dir, filename), "rt") as f:
                    for line in f:
                        review_id = line[:RATING_INDEX - 1]
                        if review_id in seen_ids:
                            continue
                        seen_ids.add(review_id)
                        review = line[REVIEW_INDEX:].rstrip()
                        yield self.dictionary.doc2bow(review.split())
            return
        for filename in self.file_list:
            with open(os.path.join(self.file_dir, filename), "rt") as f:
                merged = " ".join([line[REVIEW_INDEX:].rstrip() for line in f])
                yield self.dictionary.doc2bow(merged.split())
if w.text == doc[-1].text: skl_texts.append(' '.join(article)) texts.append(article) article = [] count += 1 if count % 100000 == 0: print(count, end=' ') with open("texts.txt", "wb") as fp: # Pickling pickle.dump(texts, fp) print('texts.csv created') bigram = gensim.models.Phrases(texts) dictionary = Dictionary(texts) dictionary.save("hdp_dictionary.dict") print("Dictionary saved as hdp_dictionary.dict") corpus = [dictionary.doc2bow(text) for text in texts] MmCorpus.serialize('hdp_corpus.mm', corpus) print('Corpus saved as hdp_corpus.mm') hdpmodel = HdpModel(corpus=corpus, id2word=dictionary) hdpmodel.save('hdp_model_spacy.gensim') print('hdp model created') hdptopics = [[word for word, prob in topic] for topicid, topic in hdpmodel.show_topics(formatted=False)] hdp_coherence = CoherenceModel(topics=hdptopics[:10], texts=texts,
def main():
    """
    Train an LSI model on a sublexicalized Wikipedia dump.

    Command-line options: -f dump file (required), -m output model file
    (required), -p / -s process counts for parsing and sublexicalizing,
    -v / -t optional vocabulary and TF-IDF files that are loaded when they
    exist and written otherwise.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    parser = OptionParser()
    parser.add_option('-f', '--corpus-file')
    parser.add_option('-p', '--parse-procs', default=1, type=int)
    parser.add_option('-s', '--sublexicalize-procs', default=1, type=int)
    parser.add_option('-t', '--tfidf-model')
    parser.add_option('-v', '--vocabulary')
    parser.add_option('-m', '--model-file')
    opts, args = parser.parse_args()

    # Report missing required options instead of exiting silently with 0.
    if not opts.corpus_file:
        parser.error('--corpus-file is required')
    if not opts.model_file:
        parser.error('--model-file is required')
    corpus_fn = opts.corpus_file
    model_fn = opts.model_file
    n_proc_parse = opts.parse_procs
    n_proc_sublex = opts.sublexicalize_procs
    vocab_fn = opts.vocabulary
    tfidf_fn = opts.tfidf_model

    # Fail fast if the dump cannot be opened; the corpus itself is handed
    # the file name, so the handle is not needed afterwards.
    with BZ2File(corpus_fn):
        pass
    corpus = SublexicalizedCorpus(
        WikiCorpus(corpus_fn, processes=n_proc_parse,
                   dictionary=Dictionary()),
        order=(3, 6), clean_func=normalize_whitespace,
        n_proc=n_proc_sublex, create_dictionary=False)

    if vocab_fn and os.path.exists(vocab_fn):
        logging.info("Loading vocabulary from %s", vocab_fn)
        vocab = Dictionary.load(vocab_fn)
    else:
        logging.info("Creating vocabulary")
        # time.clock() was removed in Python 3.8
        start = time.perf_counter()
        vocab = Dictionary(corpus.get_texts())
        end = time.perf_counter()
        logging.info("Vocabulary created in %d seconds", end - start)
        if vocab_fn:
            logging.info("Saving dictionary to %s", vocab_fn)
            vocab.save(vocab_fn)

    # The saved vocabulary is unfiltered; filtering is applied on every run.
    corpus.dictionary = vocab
    corpus.dictionary.filter_extremes(no_below=5, no_above=.8)
    corpus.dictionary.compactify()

    if tfidf_fn and os.path.exists(tfidf_fn):
        logging.info("Reading TF-IDF model from %s", tfidf_fn)
        tfidf = TfidfModel.load(tfidf_fn)
    else:
        logging.info("creating TF-IDF model")
        tfidf = TfidfModel(corpus)
        if tfidf_fn:
            logging.info("Saving TFF-IDF model to %s", tfidf_fn)
            tfidf.save(tfidf_fn)

    bow_corpus = (tfidf[art] for art in corpus)
    model = LsiModel(corpus=bow_corpus, num_topics=10,
                     id2word=corpus.dictionary)
    model.save(model_fn)
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import pyLDAvis
import pyLDAvis.gensim

# Learn the vocabulary from the line-per-document article file.
sent = LineSentence('articles.txt')
article_dict = Dictionary(sent)

# Drop tokens that are very rare or too common (filter_extremes), then
# reassign contiguous integer ids (compactify) before saving.
article_dict.filter_extremes(no_below=5, no_above=0.2)
article_dict.compactify()
article_dict.save('articles.dict')

# Continue from the copy stored on disk.
article_dict = Dictionary.load('articles.dict')


def bow(filepath, d):
    """Yield the bag-of-words form of every line of `filepath` using `d`."""
    yield from (d.doc2bow(tokens) for tokens in LineSentence(filepath))


# Stream the bag-of-words documents into a Matrix Market file, then open the
# stored corpus.
MmCorpus.serialize('articles.mm', bow('articles.txt', article_dict))
corpus = MmCorpus('articles.mm')
# %% get corpus & dictionary to use for further nlp analysis
# Prepare the dictionary and the corpus once and for all: dump both to
# files that later analysis steps can simply load.

# Dictionary: a mapping between words and their integer ids. See the Gensim
# documentation: https://radimrehurek.com/gensim/corpora/dictionary.html
pr_dictionary = Dictionary(docs_phrased)
pr_dictionary.save('/tmp/pr_dictionary.dict')

# Corpus: per the Gensim documentation, doc2bow converts a document into
# bag-of-words form, i.e. a list of (token_id, token_count) tuples.
pr_corpus = list(map(pr_dictionary.doc2bow, docs_phrased))

# Gensim offers several utilities to write a corpus to a file; the Matrix
# Market format is used here: https://math.nist.gov/MatrixMarket/formats.html
corpora.MmCorpus.serialize('/tmp/pr_corpus.mm', pr_corpus)
class DocumentTermMatrix(CompactIOMachine):
    """ Document-term matrix for corpus.

    This is a class that handles the document-term matrix (DTM). With a given
    corpus, users can retrieve term frequency, document frequency, and total
    term frequency. Weighing using tf-idf can be applied.
    """

    def __init__(self, corpus, docids=None, tfidf=False):
        """ Initialize the document-term matrix (DTM) class with a given corpus.

        If document IDs (docids) are given, they are stored and used in
        output. Surplus IDs are dropped; if fewer IDs than documents are
        given, the remaining documents are identified by their position.
        Without docids, all documents are indexed by numbers. Users can
        choose to weigh by tf-idf. The default is not to weigh.

        The corpus has to be a list of lists, each inner list holding the
        tokens of one document.

        :param corpus: corpus.
        :param docids: list of designated document IDs. (Default: None)
        :param tfidf: whether to weigh using tf-idf. (Default: False)
        :type corpus: list
        :type docids: list
        :type tfidf: bool
        """
        CompactIOMachine.__init__(self, {'classifier': 'dtm'}, 'dtm',
                                  dtm_suffices)
        n_docs = len(corpus)
        if docids is None:
            self.docids = list(range(n_docs))
        elif len(docids) == n_docs:
            self.docids = docids
        elif len(docids) > n_docs:
            self.docids = docids[:n_docs]
        else:
            # Pad with positional ids; a given id equal to one of these
            # numbers would collide with it.
            self.docids = list(docids) + list(range(len(docids), n_docs))
        self.docid_dict = {docid: i for i, docid in enumerate(self.docids)}

        # generate DTM
        self.generate_dtm(corpus, tfidf=tfidf)

    def generate_dtm(self, corpus, tfidf=False):
        """ Generate the inside document-term matrix and other peripheral
        information objects. This is run when the class is instantiated.

        :param corpus: corpus.
        :param tfidf: whether to weigh using tf-idf. (Default: False)
        :return: None
        :type corpus: list
        :type tfidf: bool
        """
        self.dictionary = Dictionary(corpus)
        # builtin float: the np.float alias was removed from NumPy
        self.dtm = dok_matrix((len(corpus), len(self.dictionary)),
                              dtype=float)
        bow_corpus = [
            self.dictionary.doc2bow(doctokens) for doctokens in corpus
        ]
        if tfidf:
            weighted_model = TfidfModel(bow_corpus)
            bow_corpus = weighted_model[bow_corpus]
        for docid in self.docids:
            row = self.docid_dict[docid]
            for tokenid, count in bow_corpus[row]:
                self.dtm[row, tokenid] = count

    def get_termfreq(self, docid, token):
        """ Retrieve the term frequency of a given token in a particular document.

        If `tfidf` was set to `True` while instantiating the class, the
        weighted term frequency is returned.

        :param docid: document ID
        :param token: term or token
        :return: term frequency or weighted term frequency of the given token in this document (designated by docid)
        :type docid: any
        :type token: str
        :rtype: numpy.float
        """
        return self.dtm[self.docid_dict[docid],
                        self.dictionary.token2id[token]]

    def get_total_termfreq(self, token):
        """ Retrieve the total occurrences of the given token.

        Sum of the stored values for the token over all documents; with
        `tfidf` set this is the sum of weighted term frequencies.

        :param token: term or token
        :return: total occurrences of the given token
        :type token: str
        :rtype: numpy.float
        """
        return sum(self.dtm[:, self.dictionary.token2id[token]].values())

    def get_doc_frequency(self, token):
        """ Retrieve the document frequency of the given token.

        The number of documents that have a stored entry for this token.

        :param token: term or token
        :return: document frequency of the given token
        :type token: str
        :rtype: int
        """
        return len(self.dtm[:, self.dictionary.token2id[token]].values())

    def get_token_occurences(self, token):
        """ Retrieve the term frequencies of a given token in all documents.

        With `tfidf` set these are the weighted term frequencies.

        :param token: term or token
        :return: a dictionary of term frequencies with the corresponding document IDs as the keys
        :type token: str
        :rtype: dict
        """
        column = self.dtm[:, self.dictionary.token2id[token]]
        return {
            self.docids[docidx]: count
            for (docidx, _), count in column.items()
        }

    def get_doc_tokens(self, docid):
        """ Retrieve the term frequencies of all tokens in the given document.

        With `tfidf` set these are the weighted term frequencies.

        :param docid: document ID
        :return: a dictionary of term frequencies with the tokens as the keys
        :type docid: any
        :rtype: dict
        """
        row = self.dtm[self.docid_dict[docid], :]
        return {
            self.dictionary[tokenid]: count
            for (_, tokenid), count in row.items()
        }

    def generate_dtm_dataframe(self):
        """ Generate the data frame of the document-term matrix. (shorttext <= 1.0.3)

        Now it raises exception.

        :return: data frame of the document-term matrix
        :rtype: pandas.DataFrame
        :raise: NotImplementedException
        """
        raise NotImplementedException()

    def savemodel(self, prefix):
        """ Save the model.

        Writes `<prefix>_docids.pkl`, `<prefix>_dictionary.dict` and
        `<prefix>_dtm.pkl`.

        :param prefix: prefix of the files
        :return: None
        :type prefix: str
        """
        with open(prefix + '_docids.pkl', 'wb') as docids_file:
            pickle.dump(self.docids, docids_file)
        self.dictionary.save(prefix + '_dictionary.dict')
        with open(prefix + '_dtm.pkl', 'wb') as dtm_file:
            pickle.dump(self.dtm, dtm_file)

    def loadmodel(self, prefix):
        """ Load the model.

        :param prefix: prefix of the files
        :return: None
        :type prefix: str
        """
        # NOTE: pickle must only be used on files written by savemodel.
        with open(prefix + '_docids.pkl', 'rb') as docids_file:
            self.docids = pickle.load(docids_file)
        self.docid_dict = {docid: i for i, docid in enumerate(self.docids)}
        self.dictionary = Dictionary.load(prefix + '_dictionary.dict')
        with open(prefix + '_dtm.pkl', 'rb') as dtm_file:
            self.dtm = pickle.load(dtm_file)
if __name__ == '__main__':
    # Build a TF-IDF model and a similarity index from the lemmas of every
    # .conll file in the directory given as first argument.
    if len(sys.argv) < 2:
        sys.exit('usage: %s TEXTDIRECTORY' % sys.argv[0])
    textdirectory = sys.argv[1]
    files = [f for f in os.listdir(textdirectory) if f.endswith('.conll')]

    # Record the processing order of the documents.
    order = json.dumps(files, indent=4, sort_keys=True)
    with open('docorder.json', 'w') as orderfile:
        orderfile.write(order)

    texts = []
    for doc in files:
        print(doc, file=sys.stderr)
        # Close each input promptly instead of leaking one handle per file.
        with open(os.path.join(textdirectory, doc),
                  errors='replace') as conll:
            data = conll.readlines()
        text = extract_lemmas(data)
        texts.append(text)

    dictionary = Dictionary(texts)
    dictionary.save('tfidf.dic')
    corpus = [dictionary.doc2bow(line) for line in texts]
    model = TfidfModel(corpus, id2word=dictionary)
    model.save('tfidf.model')
    sim_index = similarities.MatrixSimilarity(model[corpus])
    sim_index.save('tfidf.index')
def main():
    """
    Command-line wrapper that builds a vocabulary and, unless the model type
    is 'vocabulary', trains and saves a gensim model (lsi, lda, word2vec or
    hdp) on a wiki, elasticsearch or file dataset.

    The vocabulary is written to `<model_fn>.vocab` when it is built here;
    the model is saved to `<model_fn>`.
    """
    parser = ArgumentParser(
        description=
        'wrapper script for churning datasets of wiki or elasticsearch kind through gensim to produce topic models please see gensim documentation for more information'
    )
    parser.add_argument('-ds', '--dataset', default='wiki',
                        help='What kind of dataset to use. (wiki,es,file)')
    parser.add_argument('-d', '--dump-file',
                        help='Wiki: bz2 dump file with wiki in it')
    parser.add_argument('-l', '--limit', type=int,
                        help='Wiki: How many documents to extract from wiki')
    parser.add_argument('--model-id', default='model',
                        help='Filename for created model.')
    parser.add_argument(
        '--model-type', default='lsi',
        help='Model type (lsi, lda, word2vec, hdp, vocabulary).')
    # type=int makes argparse reject non-numeric input with a usage error
    # instead of a ValueError traceback later on.
    parser.add_argument('--n-topics', default=10, type=int,
                        help='Number of topics to model.')
    parser.add_argument('--n-passes', default=1, type=int,
                        help='Number of passes for LDA model.')
    parser.add_argument('--w2v-size', default=100, type=int,
                        help='size of Word2Vec context.')
    parser.add_argument('--w2v-window', default=5, type=int,
                        help='window for Word2Vec.')
    parser.add_argument('-q', '--query', default=None,
                        help='Elasticsearch: Query to use to fetch documents')
    parser.add_argument('--index', help='Elasticsearch: index to read from.')
    parser.add_argument('--doc_type', default='doc',
                        help='Elasticsearch: data type in index.')
    parser.add_argument(
        '--data-dir',
        help='Directory to save the generated models and vocabularies into.')
    parser.add_argument(
        '--vocab',
        help=
        'Prebuilt Vocabulary file. Use this to avoid having to generate one.')
    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ['lsi', 'lda', 'word2vec', 'hdp', 'vocabulary']:
        logging.error("Invalid model type %s" % model_type)
        parser.print_usage()
        sys.exit(-1)
    logging.info("Using model type %s" % model_type)

    dump_fn = opts.dump_file
    # A limit of 0 is treated like no limit.
    limit = opts.limit if opts.limit else None

    data_type = opts.dataset.lower()
    if data_type not in ['es', 'wiki', 'file']:
        logging.error("Invalid dataset type %s" % data_type)
        parser.print_usage()
        sys.exit(-1)
    if not dump_fn and data_type in ['wiki']:
        logging.error('--dump-file required for wiki dataset')
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == 'es' and index is None:
        logging.error(
            "Please be kind to at least specify the index you want to fetch from elasticsearch using the --index parameter"
        )
        sys.exit(1)

    n_topics = opts.n_topics
    n_passes = opts.n_passes
    logging.info("Using %d topics." % n_topics)

    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = '%s_%s_%d' % (model_id, model_type, n_topics)
    if data_dir:
        model_fn = '%s/%s' % (data_dir, model_fn)
    if model_type == 'word2vec':
        w2v_size = opts.w2v_size
        w2v_window = opts.w2v_window
        model_fn = '%s_w_%s_s_%s' % (model_fn, w2v_window, w2v_size)
    logging.info("Writing models to %s." % model_fn)

    if data_type == 'es':
        logging.info("Using data type %s with index %s, doc_type %s query %s"
                     % (data_type, index, doc_type, query))
        dataset = ElasticsearchDataset(read_index=index,
                                       read_doc_type=doc_type, query=query,
                                       normalize_func=normalize_es)
    elif data_type == 'wiki':
        logging.info("Using data type %s with dump_file %s and limit %s"
                     % (data_type, dump_fn, limit))
        dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit,
                                   normalize_func=normalize_wiki)
    elif data_type == 'file':
        logging.info("Using data type %s with dump_file %s and limit %s"
                     % (data_type, dump_fn, limit))
        dataset = FileDataset(dump_fn=dump_fn, num_articles=limit,
                              normalize_func=normalize_file)

    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words('norwegian'))
    if not vocab_file or model_type == 'vocabulary':
        # Stream the documents: add_documents only iterates once, so the
        # whole tokenized dataset never has to sit in memory.
        vocab.add_documents(get_tokenized(page, sw) for page in dataset)
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + '.vocab')
    else:
        vocab = Dictionary.load(vocab_file)
    if model_type == 'vocabulary':
        return

    tfidf = TfidfModel(dictionary=vocab)
    if model_type == 'lsi':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus], num_topics=n_topics,
                         id2word=vocab)
    elif model_type == 'lda':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus], num_topics=n_topics,
                         passes=n_passes, id2word=vocab)
    elif model_type == 'word2vec':
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == 'hdp':
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)
    logging.info(model)
    model.save(model_fn)
class DMP(object):
    """LDA topic model plus its dictionary, configured from the 'dmp' section."""

    def __init__(self):
        self.dic = None
        self.lda = None
        self.topic_num = config.getint('dmp', 'topic_num')
        self.corpus_file = config.get('dmp', 'corpus_file')

    @staticmethod
    def __text2doc(iterator, sep=u' '):
        '''Convert lines of text into documents.

        Each line is stripped and cut into a list of words with split.

        Args:
            iterator: iterable of text lines
            sep: separator

        Returns:
            A list with one word list per line.
        '''
        return [line.strip().split(sep) for line in iterator]

    def __load_corpus(self):
        '''Read the corpus file.

        Returns:
            The list of documents produced by text2doc.
        '''
        with codecs.open(self.corpus_file, 'r', 'utf-8') as lines:
            return self.__text2doc(lines)

    def train(self):
        '''Train the model, producing the dictionary (dic) and the model (lda).

        dic: stores the words, each with an id; dic[id] gives the word.
        lda: the model holding the topics; each topic has an id and
             lda.print_topic(id) shows the words of that topic.
        '''
        documents = self.__load_corpus()
        self.dic = Dictionary(documents)
        bags = list(map(self.dic.doc2bow, documents))
        self.lda = LdaModel(bags, id2word=self.dic,
                            num_topics=self.topic_num)

    def infer(self, doc):
        '''Infer the topics of a new document.

        Args:
            doc: the new document, given as a list of words

        Returns:
            The model's result for the document; topics are given by id,
            use lda.print_topic to make them human-readable.
        '''
        return self.lda[self.dic.doc2bow(doc)]

    def dump(self):
        '''Save the lda model and the dic dictionary.'''
        lda_path = config.get('dmp', 'lda_file')
        dic_path = config.get('dmp', 'dic_file')
        self.lda.save(lda_path)
        self.dic.save(dic_path)

    def load(self):
        '''Load the lda model and the dic dictionary.'''
        lda_path = config.get('dmp', 'lda_file')
        dic_path = config.get('dmp', 'dic_file')
        self.lda = LdaModel.load(lda_path)
        self.dic = Dictionary.load(dic_path)