def _create_inferred_model(method: str, train_corpus: tm.TrainingCorpus) -> tm.InferredModel:
    """Train a small fixture topic model (4 topics, 1 pass) for tests.

    Writes engine work files to a unique folder under ./tests/output so
    concurrent test runs do not collide.
    """
    engine_args: dict = {
        'n_topics': 4,
        'passes': 1,
        'random_seed': 42,
        'workers': 1,
        'max_iter': 100,
        'work_folder': f'./tests/output/{uuid.uuid4()}',
    }
    return tm.train_model(train_corpus=train_corpus, method=method, engine_args=engine_args)
def train(self: TopicModelMixinProtocol, train_corpus: tm.TrainingCorpus) -> tm.InferredModel:
    """Train a topic model on `train_corpus` and persist it under `self.target_subfolder`.

    Saves the raw gensim model, stores the inferred model (optionally
    compressed), and — when `self.store_corpus` is set — the training
    corpus as well. Returns the trained `tm.InferredModel`.
    """
    model: tm.InferredModel = tm.train_model(
        train_corpus=train_corpus,
        method=self.engine,
        engine_args=self.engine_args,
    )
    # Target folder may not exist on a fresh run.
    os.makedirs(self.target_subfolder, exist_ok=True)
    model.topic_model.save(jj(self.target_subfolder, 'gensim.model.gz'))
    model.store(folder=self.target_subfolder, store_compressed=self.store_compressed)
    if self.store_corpus:
        train_corpus.store(self.target_subfolder)
    return model
def compute_topic_model_handler(*_):
    """Widget callback: train a topic model from the GUI settings, persist it, and display top topic tokens.

    NOTE(review): this is a closure — `self`, `corpus`, `buzy`, `gensim_logger`
    and `logger` are captured from the enclosing scope, which is not visible here.
    """
    self.model_widgets.output.clear_output()
    # `buzy` presumably toggles a busy indicator in the GUI — confirm in enclosing scope.
    buzy(True)
    # Verbose gensim logging only when the user asked for a trace.
    gensim_logger.setLevel(logging.INFO if self.model_widgets.show_trace.value else logging.WARNING)
    with self.model_widgets.output:
        try:
            # Unique model name => unique target folder per run.
            name: str = str(uuid.uuid1())
            # FIXME: Move code block out of GUI (to workflows)
            target_folder = os.path.join(self.data_folder, name)
            vectorizer_args = dict(apply_idf=self.model_widgets.apply_idf.value)
            topic_modeller_args = dict(
                n_topics=self.model_widgets.n_topics.value,
                max_iter=self.model_widgets.max_iter.value,
                learning_method='online',
                n_jobs=1,
            )
            method = self.model_widgets.method.value
            train_corpus = tm.TrainingCorpus(
                # Materialize the terms stream up front so it can be reused by predict.
                corpus=list(self.get_corpus_terms(corpus)),
                document_index=self.document_index,
                vectorizer_args=vectorizer_args,
            )
            trained_model: tm.InferredModel = tm.train_model(
                train_corpus=train_corpus, method=method, engine_args=topic_modeller_args
            )
            # Persist raw gensim model, inferred model and the training corpus.
            trained_model.topic_model.save(os.path.join(target_folder, 'gensim.model'))
            trained_model.store(folder=target_folder, store_compressed=True)
            train_corpus.store(folder=target_folder)
            # Predict document/topic distributions on the same corpus used for training.
            inferred_topics: tm.InferredTopicsData = tm.predict_topics(
                topic_model=trained_model.topic_model,
                corpus=train_corpus.corpus,
                id2token=train_corpus.id2token,
                document_index=train_corpus.document_index,
                n_tokens=self.n_tokens,
                minimum_probability=self.minimum_probability,
            )
            inferred_topics.store(target_folder=target_folder, pickled=False)
            # Publish results to shared GUI state before rendering.
            self.state.update(trained_model=trained_model, inferred_topics=inferred_topics, train_corpus=train_corpus)
            topics: pd.DataFrame = get_topics_unstacked(
                self.state.topic_model,
                n_tokens=100,
                id2term=self.inferred_topics.id2term,
                topic_ids=self.inferred_topics.topic_ids,
            )
            display(topics)
        except Exception as ex:
            # Clear stale topics so the GUI does not show results from a failed run.
            logger.error(ex)
            self.state.update(inferred_topics=None)
            raise
        finally:
            buzy(False)
def compute(
    name: str = None,
    corpus_folder: str = None,
    corpus_source: str = None,
    engine: str = "gensim_lda-multicore",
    engine_args: dict = None,
    filename_field: str = None,
    minimum_probability: float = 0.001,
    n_tokens: int = 200,
    store_corpus: bool = False,
    compressed: bool = True,
):
    """Train a topic model on a tokenized text corpus and store model + predicted topics.

    Args:
        name: model tag; results are written to `<corpus_folder>/<name>`.
        corpus_folder: output root; derived from `corpus_source`'s folder when None.
        corpus_source: path to the source text corpus (required when `corpus_folder` is None).
        engine: topic modelling engine, must be one of SUPPORTED_ENGINES.
        engine_args: engine-specific keyword arguments passed to `tm.train_model`.
        filename_field: filename metadata field specification(s); must be non-empty.
        minimum_probability: threshold for document/topic assignments in prediction.
        n_tokens: number of top tokens to keep per topic.
        store_corpus: also persist the training corpus when True.
        compressed: store the inferred model compressed.

    Raises:
        ValueError: on unsupported engine or missing name/corpus/filename-field arguments.
    """
    if engine not in SUPPORTED_ENGINES:
        raise ValueError(f"Engine {engine} not supported or deprecated")
    # Fix: `name` was previously unvalidated; a None name made
    # os.path.join(corpus_folder, name) raise an obscure TypeError.
    if not name:
        raise ValueError("target name not specified")
    if corpus_source is None and corpus_folder is None:
        raise ValueError("corpus filename")
    if len(filename_field or []) == 0:
        raise ValueError("corpus filename fields")
    if corpus_folder is None:
        corpus_folder, _ = os.path.split(os.path.abspath(corpus_source))

    target_folder = os.path.join(corpus_folder, name)
    os.makedirs(target_folder, exist_ok=True)

    reader_opts = TextReaderOpts(
        filename_pattern="*.txt",
        filename_filter=None,
        filename_fields=filename_field,
    )
    transform_opts = TextTransformOpts(fix_whitespaces=False, fix_hyphenation=True)
    tokens_reader = TextTokenizer(
        source=corpus_source,
        transform_opts=transform_opts,
        reader_opts=reader_opts,
    )
    corpus: TokenizedCorpus = TokenizedCorpus(reader=tokens_reader, transform_opts=None)
    train_corpus: tm.TrainingCorpus = tm.TrainingCorpus(
        corpus=corpus,
        # Record how the corpus was produced alongside the model.
        corpus_options=dict(
            reader_opts=reader_opts.props,
            transform_opts=transform_opts.props,
        ),
    )

    inferred_model: tm.InferredModel = tm.train_model(
        train_corpus=train_corpus,
        method=engine,
        engine_args=engine_args,
    )

    inferred_model.topic_model.save(os.path.join(target_folder, 'gensim.model.gz'))
    inferred_model.store(target_folder, store_compressed=compressed)

    if store_corpus:
        train_corpus.store(target_folder)

    # Predict topics for the same corpus the model was trained on.
    inferred_topics: tm.InferredTopicsData = tm.predict_topics(
        inferred_model.topic_model,
        corpus=train_corpus.corpus,
        id2token=train_corpus.id2token,
        document_index=train_corpus.document_index,
        minimum_probability=minimum_probability,
        n_tokens=n_tokens,
    )
    inferred_topics.store(target_folder)
def compute(
    *,
    target_name: str = None,
    corpus_source: str = None,
    target_folder: str = None,
    reader_opts: TextReaderOpts = None,
    text_transform_opts: TextTransformOpts = None,
    transform_opts: TokensTransformOpts = None,
    engine: str = "gensim_lda-multicore",
    engine_args: dict = None,
    store_corpus: bool = False,
    store_compressed: bool = True,
    n_tokens: int = 200,
    minimum_probability: float = 0.001,
):
    """Runner: train a topic model on a tokenized corpus, store model and predicted topics.

    Args:
        target_name: tag returned with the result; identifies the model.
        corpus_source: path to the source text corpus.
        target_folder: folder where model, topics and (optionally) corpus are written.
        reader_opts: text reader options (filename pattern/fields).
        text_transform_opts: text-level transforms applied when reading.
        transform_opts: token-level transforms applied by the tokenized corpus.
        engine: topic modelling engine name.
        engine_args: engine-specific keyword arguments for `tm.train_model`.
        store_corpus: also persist the training corpus when True.
        store_compressed: store the inferred model compressed.
        n_tokens: number of top tokens to keep per topic.
        minimum_probability: threshold for document/topic assignments.

    Returns:
        dict with keys `folder` (output folder) and `tag` (target name).
    """
    tokens_reader = TextTokenizer(
        source=corpus_source,
        transform_opts=text_transform_opts,
        reader_opts=reader_opts,
    )
    corpus: TokenizedCorpus = TokenizedCorpus(reader=tokens_reader, transform_opts=transform_opts)
    train_corpus: tm.TrainingCorpus = tm.TrainingCorpus(
        corpus=corpus,
        document_index=corpus.document_index,
        token2id=corpus.token2id,
        # Fix: guard `.props` access — both opts default to None, which
        # previously raised AttributeError here.
        corpus_options=dict(
            reader_opts=reader_opts.props if reader_opts is not None else None,
            transform_opts=transform_opts.props if transform_opts is not None else None,
        ),
    )

    inferred_model: tm.InferredModel = tm.train_model(
        train_corpus=train_corpus,
        method=engine,
        engine_args=engine_args,
    )

    # Fix: ensure output folder exists before gensim save (sibling `compute`
    # in this file creates it; this runner previously did not).
    os.makedirs(target_folder, exist_ok=True)
    inferred_model.topic_model.save(jj(target_folder, 'gensim.model.gz'))
    inferred_model.store(target_folder, store_compressed=store_compressed)

    if store_corpus:
        train_corpus.store(target_folder)

    # Predict topics for the same corpus the model was trained on.
    inferred_topics: tm.InferredTopicsData = tm.predict_topics(
        inferred_model.topic_model,
        corpus=train_corpus.corpus,
        id2token=train_corpus.id2token,
        document_index=train_corpus.document_index,
        n_tokens=n_tokens,
        minimum_probability=minimum_probability,
    )
    inferred_topics.store(target_folder)

    return dict(folder=target_folder, tag=target_name)