import logging

from contextualized_topic_models.models.ctm import CTM
from contextualized_topic_models.datasets.dataset import CTMDataset

# TwitterDataset and the TRAIN_SIZE / NUM_TOPICS / BATCH_SIZE / MINI_BATCH_SIZE /
# HIDDEN_UNITS constants are project-local and assumed to be defined elsewhere.

logger = logging.getLogger(__name__)


class TopicModel:
    def __init__(self, train_size: int = TRAIN_SIZE, n_topics: int = NUM_TOPICS,
                 batch_size: int = BATCH_SIZE, shuffle=True):
        data_loader = TwitterDataset("TRAIN", train_size,
                                     batch_size=batch_size, shuffle=shuffle)
        data_loader.init()
        self.data_loader = data_loader
        self.ctm = CTM(input_size=len(self.data_loader.vocab),
                       bert_input_size=512,
                       num_epochs=20,
                       batch_size=MINI_BATCH_SIZE,
                       inference_type="contextual",
                       n_components=n_topics,
                       reduce_on_plateau=True,
                       lr=1e-4,
                       hidden_sizes=HIDDEN_UNITS,
                       num_data_loader_workers=0)

    def train(self):
        # Fit the CTM incrementally, one encoded batch of tweets at a time.
        for i in range(self.data_loader.n_batches):
            logger.info(f"Starting Batch {i+1}")
            self.data_loader.encoded_next_batch()
            training_dataset = CTMDataset(self.data_loader.bow,
                                          self.data_loader.bert_embeddings,
                                          self.data_loader.idx2token)
            self.ctm.fit(training_dataset)
            logger.info("\n---------------------------")
            logger.info("--------Topics---------")
            for t in self.ctm.get_topic_lists(10):
                logger.info(t)
            logger.info("---------------------------\n")
        return self.ctm.get_topic_lists(10)
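# A minimal usage sketch for the class above (illustrative, not from the
# source): it assumes the project-local constants and TwitterDataset wrapper
# referenced in __init__ are available, and n_topics=25 is an arbitrary
# example value.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    model = TopicModel(n_topics=25)
    topics = model.train()  # returns the top-10 words of each topic
    for topic in topics:
        print(topic)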
from contextualized_topic_models.models.ctm import CTM
from contextualized_topic_models.utils.data_preparation import (
    TextHandler, bert_embeddings_from_file)
from contextualized_topic_models.datasets.dataset import CTMDataset


def test_training(data_dir):
    handler = TextHandler(data_dir + "sample_text_document")
    handler.prepare()  # create vocabulary and training data

    train_bert = bert_embeddings_from_file(
        data_dir + 'sample_text_document', "distiluse-base-multilingual-cased")
    training_dataset = CTMDataset(handler.bow, train_bert, handler.idx2token)

    ctm = CTM(input_size=len(handler.vocab), bert_input_size=512, num_epochs=1,
              inference_type="combined", n_components=5)
    ctm.fit(training_dataset)  # run the model

    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5

    thetas = ctm.get_thetas(training_dataset)
    assert len(thetas) == len(train_bert)
import sys

from contextualized_topic_models.models.ctm import CTM
from contextualized_topic_models.evaluation.measures import CoherenceNPMI
from contextualized_topic_models.utils.data_preparation import TextHandler


def show_topics(topic_list):
    for idx, topic_tokens in enumerate(topic_list):
        print(idx)
        print(' '.join(topic_tokens))


# ctm.load() below needs both a model directory and an epoch, so two
# command-line arguments are required (the original check only demanded one).
if len(sys.argv) < 3:
    raise Exception("Usage: python {} {} {}".format(
        sys.argv[0], "<model_dir>", "<epoch>"))

handler = TextHandler(
    "contextualized_topic_models/data/wiki/wiki_train_en_prep.txt")
handler.prepare()

ctm = CTM(input_size=len(handler.vocab), inference_type="contextual",
          bert_input_size=768, num_data_loader_workers=1)
ctm.load(sys.argv[1], sys.argv[2])

with open("contextualized_topic_models/data/wiki/wiki_train_en_prep.txt", "r") as en:
    texts = [doc.split() for doc in en.read().splitlines()]

# obtain NPMI coherence on the topic-modeled documents
show_topics(ctm.get_topic_lists(10))
npmi = CoherenceNPMI(texts=texts, topics=ctm.get_topic_lists(10))
print(npmi.score())
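# Example invocation of the script above (the script name, checkpoint
# directory, and epoch number are placeholders, not from the source):
#
#     python evaluate_wiki_npmi.py saved_models/ctm_wiki 19
#
# The first argument is the directory holding the saved checkpoints; the
# second selects which epoch's checkpoint ctm.load() should restore.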
# NOTE: import paths follow the 1.x releases of contextualized-topic-models;
# in particular, QuickText's location in utils.data_preparation is assumed.
from contextualized_topic_models.models.ctm import CTM, ZeroShotTM, CombinedTM
from contextualized_topic_models.utils.data_preparation import (
    TextHandler, QuickText, bert_embeddings_from_file, bert_embeddings_from_list)
from contextualized_topic_models.datasets.dataset import CTMDataset


def test_training_all_classes_ctm(data_dir):
    handler = TextHandler(data_dir + "sample_text_document")
    handler.prepare()  # create vocabulary and training data

    train_bert = bert_embeddings_from_file(
        data_dir + 'sample_text_document', "distiluse-base-multilingual-cased")
    training_dataset = CTMDataset(handler.bow, train_bert, handler.idx2token)

    ctm = CTM(input_size=len(handler.vocab), bert_input_size=512, num_epochs=1,
              inference_type="combined", n_components=5)
    ctm.fit(training_dataset)  # run the model

    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5
    thetas = ctm.get_doc_topic_distribution(training_dataset)
    assert len(thetas) == len(train_bert)

    ctm = ZeroShotTM(input_size=len(handler.vocab), bert_input_size=512,
                     num_epochs=1, n_components=5)
    ctm.fit(training_dataset)  # run the model

    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5
    thetas = ctm.get_doc_topic_distribution(training_dataset)
    assert len(thetas) == len(train_bert)

    ctm = CombinedTM(input_size=len(handler.vocab), bert_input_size=512,
                     num_epochs=1, n_components=5)
    ctm.fit(training_dataset)  # run the model

    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5
    thetas = ctm.get_doc_topic_distribution(training_dataset)
    assert len(thetas) == len(train_bert)

    # Same pipeline, but building the handler from an in-memory list of sentences.
    with open(data_dir + 'sample_text_document') as filino:
        data = filino.readlines()

    handler = TextHandler(sentences=data)
    handler.prepare()  # create vocabulary and training data

    train_bert = bert_embeddings_from_list(
        data, "distiluse-base-multilingual-cased")
    training_dataset = CTMDataset(handler.bow, train_bert, handler.idx2token)

    ctm = CTM(input_size=len(handler.vocab), bert_input_size=512, num_epochs=1,
              inference_type="combined", n_components=5)
    ctm.fit(training_dataset)  # run the model

    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5
    thetas = ctm.get_doc_topic_distribution(training_dataset)
    assert len(thetas) == len(train_bert)

    # QuickText bundles BoW creation and contextual embedding in one step.
    qt = QuickText("distiluse-base-multilingual-cased",
                   text_for_bow=data, text_for_bert=data)
    dataset = qt.load_dataset()

    ctm = ZeroShotTM(input_size=len(qt.vocab), bert_input_size=512,
                     num_epochs=1, n_components=5)
    ctm.fit(dataset)  # run the model

    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5

    # A QuickText can also be rebuilt from a previously computed configuration.
    qt_from_conf = QuickText("distiluse-base-multilingual-cased", None, None)
    qt_from_conf.load_configuration(qt.bow, qt.data_bert, qt.vocab, qt.idx2token)
    dataset = qt_from_conf.load_dataset()

    ctm = ZeroShotTM(input_size=len(qt.vocab), bert_input_size=512,
                     num_epochs=1, n_components=5)
    ctm.fit(dataset)  # run the model

    topics = ctm.get_topic_lists(2)
    assert len(topics) == 5
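# The data_dir argument taken by the test functions above is a pytest fixture.
# A minimal conftest.py sketch that would satisfy it (the path is a
# placeholder, not from the source):
import pytest


@pytest.fixture
def data_dir():
    # Directory containing sample_text_document for the tests above.
    return "contextualized_topic_models/data/"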