import logging
import os

# Imports assumed from the lda2vec (Chainer) package used below.
from chainer import serializers
from lda2vec import LDA2Vec, preprocess

log = logging.getLogger(__name__)

# get_docs, make_texts, get_questions, preprocess_text, and make_corpus are
# assumed to be project-local helpers defined elsewhere in this codebase.


def get_tokens():
    """Build the combined document/question corpus and tokenize it.

    :return: (tokens, vocab) as produced by preprocess.tokenize
    """
    docs = get_docs()
    texts = make_texts(docs, single=False)
    questions = get_questions()
    texts.extend(questions)
    texts = preprocess_text(texts)
    texts = [t for t in texts if t]
    tokens, vocab = preprocess.tokenize(texts, 7500, tag=False, parse=False, entity=False)
    return tokens, vocab

def main():
    docs = get_docs()
    texts = make_texts(docs, single=False)
    questions = get_questions()
    texts.extend(questions)
    texts = preprocess_text(texts)
    texts = [t for t in texts if t]
    tokens, vocab = preprocess.tokenize(texts, 7500, tag=False, parse=False, entity=False)
    log.info("Got tokens and vocabulary. Vocab size: %d" % len(vocab))
    corpus, flat_corpus, doc_ids, clean_set = make_corpus(tokens=tokens, min_count=50)
    log.info("Got corpus")

    # Model parameters
    # Number of documents
    n_docs = len(texts)
    log.info("number of texts: %d" % n_docs)
    # Number of unique words in the vocabulary
    n_words = flat_corpus.max() + 1
    # Number of dimensions in a single word vector
    n_hidden = 128
    # Number of topics to fit
    n_topics = 20
    # Get the count for each compact key
    counts = corpus.keys_counts[:n_words]
    # Get the string representation for every compact key
    words = corpus.word_list(vocab)[:n_words]
    log.info("Words: \n %s" % words)

    # Fit the model
    log.info("fitting the model")
    model = LDA2Vec(n_words, n_hidden, counts, dropout_ratio=0.2)
    model.add_categorical_feature(n_docs, n_topics, name="document_id")
    model.finalize()
    # Resume from a previous checkpoint if one exists
    if os.path.exists("model.hdf5"):
        serializers.load_hdf5("model.hdf5", model)
    for epoch in range(200):
        log.info("epoch #%d" % epoch)
        # Compute the top words once per epoch and log the result (the
        # original called top_words_per_topic twice back to back).
        log.info("TOP_WORDS_PER_TOPIC!\n => ")
        log.info(model.top_words_per_topic("document_id", words))
        log.info("========")
        model.fit(flat_corpus, categorical_features=[doc_ids], fraction=1e-3, epochs=1)
        model.to_cpu()
        serializers.save_hdf5("model.hdf5", model)
    # Log the final topics after training completes
    log.info(model.top_words_per_topic("document_id", words))
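

# A minimal entry point, assuming this module is meant to be run directly as
# a script; the logging configuration is an illustrative default and not part
# of the original code.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()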