def createLsiModelforCorpus(corpusfile, dictfile, numtop):
    print("\nLoading dictionary...")
    dictionary = corpora.Dictionary.load_from_text(dictfile)  # renamed from `dict` to avoid shadowing the builtin
    print(dictionary)

    print("\nLoading corpus...")
    corpus = corpora.MmCorpus(corpusfile)
    print(corpus)

    print("\nPerforming Latent Semantic Indexing...")
    lsi = LsiModel(corpus=corpus, num_topics=numtop, id2word=dictionary, distributed=False)
    # The stochastic (aka truncated) SVD below throws runtime memory errors (e.g. segmentation fault):
    # lsi = stochastic_svd(corpus, rank=100, num_terms=args.ntopics)

    corpustopics = lsi.show_topics(num_words=10, log=True, formatted=False)

    # Write the model and topics into a fresh lsi_output folder.
    # NOTE: the file names rely on a module-level `args` (argparse namespace).
    rootdir = os.getcwd()
    folderpath = os.path.join(rootdir, 'lsi_output')
    if os.path.exists(folderpath):
        shutil.rmtree(folderpath)
    os.makedirs(folderpath)
    os.chdir(folderpath)

    lsimodelfile = str(args.corpus).replace('.mm', '') + '_lsi.model'
    lsi.save(lsimodelfile)

    filename1 = str(args.corpus).replace('.mm', '') + '_lsi_topics.pkl'
    with open(filename1, 'wb') as output:
        pickle.dump(corpustopics, output)

    os.chdir(rootdir)
    return corpustopics, lsi
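# A minimal reload sketch for the artifacts written above (not part of the original code):
# the 'lsi_output' folder and the '*_lsi.model' / '*_lsi_topics.pkl' names mirror
# createLsiModelforCorpus; the helper name and arguments are illustrative only.
import os
import pickle
from gensim.models import LsiModel

def load_lsi_output(corpusfile, rootdir='.'):
    base = str(corpusfile).replace('.mm', '')
    folder = os.path.join(rootdir, 'lsi_output')
    lsi = LsiModel.load(os.path.join(folder, base + '_lsi.model'))
    with open(os.path.join(folder, base + '_lsi_topics.pkl'), 'rb') as f:
        topics = pickle.load(f)
    return lsi, topics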
def train_model(filename, output_name, data=None):
    output = data if data is not None else {}  # avoid a shared mutable default argument
    output['dataset'] = filename
    output['output_name'] = output_name

    df = pd.read_csv('./data/dataset/%s' % filename)

    # Parse the stringified lemma lists back into token lists.
    lemmas_list = []
    for lemmas in df['lemmas']:
        lemmas = str(lemmas)
        lemmas = lemmas.replace('[', '').replace(']', '').replace(',', '').replace('\'', '')
        lemmas_list.append(lemmas.split())

    dictionary = corpora.Dictionary(lemmas_list)
    make_dir('./data/dicts/')
    dictionary.save('./data/dicts/%s_corpus.dict' % output_name)
    output['dict'] = '%s_corpus.dict' % output_name

    clean_doc = [dictionary.doc2bow(text) for text in lemmas_list]
    tfidf = models.TfidfModel(clean_doc, normalize=True)

    lsi = LsiModel(corpus=tfidf[clean_doc], id2word=dictionary, num_topics=200)
    make_dir('./data/models')
    lsi.save('./data/models/%s_model.txt' % output_name)
    output['model'] = '%s_model.txt' % output_name
    return output
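# Illustrative follow-up to train_model (the helper below is hypothetical): load the saved
# dictionary and LSI model and project a new list of lemmas into topic space. Note that
# train_model does not save its TfidfModel, so this projects plain bag-of-words vectors.
from gensim import corpora
from gensim.models import LsiModel

def infer_topics(output_name, lemmas):
    dictionary = corpora.Dictionary.load('./data/dicts/%s_corpus.dict' % output_name)
    lsi = LsiModel.load('./data/models/%s_model.txt' % output_name)
    return lsi[dictionary.doc2bow(lemmas)]  # list of (topic_id, weight) pairs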
class MyModel:
    def __init__(self, dict_file=None, corpus_model=None, corpus_file=None):
        self.dict_file = dict_file
        self.dictionary = None
        self.corpus = None
        if dict_file is not None:
            self.dictionary = corpora.Dictionary.load(dict_file)
        if corpus_model:
            self.corpus = corpus_model  # was `self.corpus_model`, which does not exist
        elif corpus_file:
            self.corpus = corpora.MmCorpus(corpus_file)
        self.tf_idf_model = None
        self.corpus_tf_idf = None
        self.lsi_model = None
        self.corpus_lsi = None
        self.lda_model = None
        self.corpus_lda = None

    def tf_idf(self):
        self.tf_idf_model = models.TfidfModel(corpus=self.corpus, normalize=True)
        self.corpus_tf_idf = self.tf_idf_model[self.corpus]

    def lsi(self):
        self.tf_idf()
        if self.corpus_tf_idf and self.dictionary:
            self.lsi_model = LsiModel(self.corpus_tf_idf, num_topics=2, id2word=self.dictionary)
            self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]
            print(self.lsi_model.print_topic(1))  # topics are 0-indexed; index 2 is out of range for num_topics=2
        elif self.corpus_tf_idf:
            self.lsi_model = LsiModel(self.corpus_tf_idf, num_topics=2)
            self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]

    def lda(self):
        # The original built an LsiModel here despite the name; use LdaModel for an actual LDA model.
        self.lda_model = models.LdaModel(corpus=self.corpus, id2word=self.dictionary)
        self.corpus_lda = self.lda_model[self.corpus]

    def add_document_lsi(self, addition_corpus_tf_idf, addition_vector_tf_idf):
        self.lsi_model.add_documents(addition_corpus_tf_idf)
        lsi_vector = self.lsi_model[addition_vector_tf_idf]
        return lsi_vector

    def save_lsi(self, name='/serialise/model.lsi'):
        self.lsi_model.save(name)

    def save_lda(self, name='/serialise/model.lda'):
        self.lda_model.save(name)

    @staticmethod
    def load_lsi(name='/tmp/model.lsi'):
        my_model = MyModel()
        my_model.lsi_model = models.LsiModel.load(name)
        return my_model
def run():
    try:
        print("starting to build LSI Model")
        start = datetime.now()

        documents = Feature.objects.exclude(text=None).values_list("text", flat=True)
        number_of_documents = len(documents)
        print("number_of_documents:", number_of_documents)

        stopwords = []
        stopwords += [month.lower() for month in month_to_number.keys()]
        stopwords += nltk_stopwords.words('english')
        print("stopwords:", len(stopwords))
        with open(path_to_directory_of_this_file + "/stopwords.txt", encoding="utf-8") as f:
            stopwords.extend([word for word in f.read().split("\n") if word and not word.startswith("#")])
        stopwords = set(stopwords)

        texts = [
            [word for word in document.lower().replace("#", " ").replace("_", " ").replace("(", " ").replace(")", " ").replace("/", " ").replace(":", " ").replace(".", " ").split()
             if word not in stopwords and len(word) > 3]
            for document in documents
        ]

        # drop tokens that appear only once across the corpus
        counter = Counter()
        for text in texts:
            counter.update(text)
        texts = [[token for token in text if counter[token] > 1] for text in texts]

        dictionary = Dictionary(texts)
        print("dictionary:", dictionary)
        dictionary.save(path_to_directory_of_this_file + "/dictionary")

        corpus = [dictionary.doc2bow(text) for text in texts]
        print("corpus:", type(corpus))

        print("generating lsi model")
        lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=10)
        print("saving LSI model")
        lsi.save(path_to_directory_of_this_file + "/model")

        Topic.objects.all().delete()
        topics = []
        for topic in lsi.show_topics():
            topics.append(Topic(id=topic[0], name=prettify_topic(topic[1])))
        Topic.objects.bulk_create(topics)
    except Exception as e:
        print(e)
def main():
    try:
        dictionary = Dictionary.load_from_text("dictionary.txt")
    except Exception:
        dictionary = Dictionary(rcv1_train)
        dictionary.filter_extremes()
        dictionary.save_as_text("dictionary.txt")

    class RCV1BowCorpus(object):
        def __iter__(self):
            for document in rcv1_train:
                yield dictionary.doc2bow(document)

    ln.debug("Training model on %s documents" % len(rcv1_train))
    try:
        vector_model = LsiModel.load("lsi_model")
    except Exception:
        vector_model = LsiModel(corpus=RCV1BowCorpus(), num_topics=100, id2word=dictionary)
        vector_model.save("lsi_model")

    def get_lsi_features(text):
        """Must return either a numpy array or a dictionary."""
        res = vector_model[dictionary.doc2bow(text)]
        return dict(res)

    def get_bow_features(text):
        return dict(dictionary.doc2bow(text))

    clf = train_classifier(train_samples=rcv1_train, train_targets=rcv1_train_target,
                           get_features=get_lsi_features, classifier="sgd")
    evaluate_classifier(clf, rcv1_test, rcv1_test_target, get_features=get_lsi_features)
def get_lsa_model(self, n_topics=50, recalculate=False, from_scratch=True):
    filepath = self.paths.get_lsa_filepath(n_topics)

    if not os.path.isfile(filepath) or recalculate:
        if not from_scratch:
            raise ValueError('No LSA file exists but from_scratch is False')

        trigram_dictionary = self.lda_builder.get_corpus_dict()
        trigram_bow_corpus = self.lda_builder.get_trigram_bow_corpus(trigram_dictionary)

        print('Building LSA model...')
        lsi = LsiModel(trigram_bow_corpus, id2word=trigram_dictionary, num_topics=n_topics)

        lsi.save(filepath)
        print('LSA model (n_topics={}) written to {}'.format(n_topics, filepath))
    else:
        print('Loading LSA model (n_topics={})...'.format(n_topics))
        lsi = LsiModel.load(filepath)

    return lsi
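# Hedged usage sketch for get_lsa_model; `builder` stands in for an instance of the
# surrounding class and is illustrative only. show_topics is a standard gensim LsiModel call.
# lsa = builder.get_lsa_model(n_topics=50)
# for topic_id, terms in lsa.show_topics(num_topics=5, num_words=8, formatted=False):
#     print(topic_id, [term for term, weight in terms])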
class TextProcessor:
    def __init__(self, n_users, n_samples, n_dims):
        self.nUsers, self.nSamples, self.nDims = n_users, n_samples, n_dims
        self.tfIdfModel = self.lsiModel = self.ldaModel = self.w2vModel = self.dictionary = None

        self.dictPath = conf.get_filename_via_tpl('model', model_type='tfidf', n_users=n_users, n_samples=n_samples, model_filename='dict')
        self.tfIdfPath = conf.get_filename_via_tpl('model', model_type='tfidf', n_users=n_users, n_samples=n_samples, model_filename='tfidf')
        self.lsiPath = conf.get_filename_via_tpl('model', model_type='lsi', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='lsi_model')
        self.ldaPath = conf.get_filename_via_tpl('model', model_type='lda', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='lda_model')
        self.w2vPath = conf.get_filename_via_tpl('model', model_type='w2v', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='w2vmodel')
        self.w2vVecPath = conf.get_filename_via_tpl('model', model_type='w2v', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='vec.txt')

    def load_model(self, model_type):
        model = None
        try:
            if model_type == 'tfidf':
                model = TfidfModel.load(self.tfIdfPath, mmap='r')
                self.tfIdfModel = model
            elif model_type == 'lsi':
                model = LsiModel.load(self.lsiPath, mmap='r')
                self.lsiModel = model
            elif model_type == 'lda':
                model = LdaModel.load(self.ldaPath, mmap='r')
                self.ldaModel = model
            elif model_type == 'w2v':
                model = Word2Vec.load(self.w2vPath, mmap='r')
                self.w2vModel = model
            else:
                logger.error('Model type error. Unexpected %s' % model_type)
                return None

            if self.dictionary is None and os.path.exists(self.dictPath):
                self.dictionary = corpora.Dictionary.load(self.dictPath)

            logger.info('%s model loaded successfully.' % model_type)
        except IOError:
            logger.error('The %s model does not exist. Please train the model before loading it.' % model_type)
        finally:
            return model

    def tf_idf_transform(self, doc):
        """Build a dictionary and TF-IDF model from doc and return the TF-IDF corpus."""
        self.dictionary = corpora.Dictionary(doc)
        corpus = [self.dictionary.doc2bow(text) for text in doc]
        self.tfIdfModel = TfidfModel(corpus)

        conf.mk_dir(self.tfIdfPath)
        self.dictionary.save(self.dictPath)
        logger.info('Dictionary has been saved in %s.' % self.dictPath)

        self.tfIdfModel.save(self.tfIdfPath)
        logger.info('TF-IDF model has been saved in %s.' % self.tfIdfPath)

        tfidf_corpus = self.tfIdfModel[corpus]
        tfidf_corpus_path = conf.get_filename_via_tpl('tfidf', n_users=self.nUsers, postfix='mm', n_samples=self.nSamples)
        corpora.MmCorpus.serialize(tfidf_corpus_path, tfidf_corpus)
        logger.info('TF-IDF corpus with a shape of %s has been saved in %s.' % (np.array(tfidf_corpus).shape, tfidf_corpus_path))

        return tfidf_corpus

    def lsi_transform(self, corpus_tf_idf):
        logger.info('Training lsi model with n_dims of %d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        self.lsiModel = LsiModel(corpus=corpus_tf_idf, num_topics=self.nDims, id2word=self.dictionary)

        conf.mk_dir(self.lsiPath)
        self.lsiModel.save(self.lsiPath)
        logger.info('Lsi model has been saved in %s.' % self.lsiPath)

        lsi_corpus = self.lsiModel[corpus_tf_idf]
        lsi_corpus_path = conf.get_filename_via_tpl('lsi', n_users=self.nUsers, n_samples=self.nSamples, n_dims=self.nDims, postfix='mm')
        conf.mk_dir(lsi_corpus_path)
        corpora.MmCorpus.serialize(lsi_corpus_path, lsi_corpus)
        logger.info('Lsi corpus with a shape of %s has been saved in %s.' % (np.array(lsi_corpus).shape, lsi_corpus_path))

        return lsi_corpus

    def lda_transform(self, corpus_tf_idf, train_separated=False, is_update=False):
        """
        Initialise an LDA model, fit it with corpus_tf_idf and transform the corpus.
        :param corpus_tf_idf: corpus that has already been transformed into TF-IDF vectors.
        :param train_separated: whether to train on the whole corpus at once or on chunks separately.
        :param is_update: whether to build a new model or update an existing one.
        :return: LDA corpus.
        """
        logger.info('Training lda model with n_dims of %d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        if is_update:
            # An ldaModel was trained before; update it with the new corpus.
            if self.ldaModel is None:
                self.load_model('lda')
            self.ldaModel.update(corpus_tf_idf)
            logger.info('Lda model has been updated successfully.')
            return self.ldaModel[corpus_tf_idf]

        if train_separated:
            # Placeholder: the corpus could be split into chunks (e.g. 10000 docs each)
            # and fed to the model separately; not implemented yet.
            pass

        self.ldaModel = LdaModel(corpus=corpus_tf_idf, num_topics=self.nDims, id2word=self.dictionary)
        conf.mk_dir(self.ldaPath)
        self.ldaModel.save(self.ldaPath)
        logger.info('Lda model has been saved in %s.' % self.ldaPath)

        lda_corpus = self.ldaModel[corpus_tf_idf]
        lda_corpus_path = conf.get_filename_via_tpl('lda', n_users=self.nUsers, n_samples=self.nSamples, n_dims=self.nDims, postfix='mm')
        conf.mk_dir(lda_corpus_path)
        corpora.MmCorpus.serialize(lda_corpus_path, lda_corpus)
        logger.info('Lda corpus with a shape of %s has been saved in %s.' % (np.array(lda_corpus).shape, lda_corpus_path))

        return lda_corpus

    def w2v_transform(self, sentences):
        """
        Train word2vec on the texts and obtain a w2v model.
        :param sentences: each sentence is a list of words of a text.
        :return: w2v corpus (one summed vector per sentence).
        """
        logger.info('Training w2v model with a dim of %d...' % self.nDims)
        self.w2vModel = Word2Vec(sentences, size=self.nDims, min_count=0)

        conf.mk_dir(self.w2vPath)
        self.w2vModel.save(self.w2vPath)
        self.w2vModel.wv.save_word2vec_format(self.w2vVecPath, binary=False)

        # Construct the w2v corpus: sum the word vectors of each sentence.
        w2v_corpus = []
        for sen in sentences:
            vec = [0] * self.nDims
            if len(sen) > 0:
                for word in sen:
                    vec = list(map(lambda m, n: m + n, vec, self.w2vModel[word]))
            w2v_corpus.append(vec)

        w2v_corpus_path = conf.get_filename_via_tpl('w2v', n_users=self.nUsers, n_samples=self.nSamples, n_dims=self.nDims)
        conf.mk_dir(w2v_corpus_path)
        with open(w2v_corpus_path, 'w') as fp:
            csv_writer = csv.writer(fp)
            for line in w2v_corpus:
                csv_writer.writerow(line)
        logger.info('W2v corpus has been saved in %s.' % w2v_corpus_path)

        return w2v_corpus

    def load_corpus(self, model_type, dense=False):
        corpus = None
        try:
            if model_type == 'tfidf':
                corpus = corpora.MmCorpus(
                    conf.get_filename_via_tpl('tfidf', n_users=self.nUsers, postfix='mm', n_samples=self.nSamples))
            elif model_type in ['lsi', 'lda']:
                corpus = corpora.MmCorpus(
                    conf.get_filename_via_tpl(model_type, n_users=self.nUsers, n_samples=self.nSamples, n_dims=self.nDims, postfix='mm'))
            elif model_type == 'w2v':
                corpus = np.loadtxt(
                    conf.get_filename_via_tpl(model_type, n_users=self.nUsers, n_samples=self.nSamples, n_dims=self.nDims),
                    dtype=np.float, delimiter=',')
            logger.info('%s corpus with a shape of %s has been loaded.' % (model_type, np.array(corpus).shape))

            if dense and model_type in ['tfidf', 'lsi', 'lda']:
                corpus = matutils.corpus2dense(corpus, self.nDims, self.nSamples * self.nUsers, dtype=np.float).T
            else:
                corpus = np.array(corpus)
        except Exception as e:
            raise e
        return corpus

    @staticmethod
    def corpus2dense(corpus, n_terms, n_docs=conf.N_SAMPLES, dtype=np.float):
        return matutils.corpus2dense(corpus, n_terms, n_docs, dtype).T

    def load_vec(self, vec_type):
        logger.info('Loading %s vectors...' % vec_type)
        try:
            corpus_vec = self.load_corpus(vec_type, True)
        except Exception as e:
            raise e
        data = []
        for i in range(self.nUsers):
            data.append(corpus_vec[i * self.nSamples:(i + 1) * self.nSamples])
        data = np.array(data, dtype=np.float)
        return data
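# A self-contained sketch (plain gensim, independent of the project's `conf` module) of the
# TF-IDF -> LSI chain that TextProcessor.tf_idf_transform and lsi_transform wrap; the toy
# documents below are illustrative only.
from gensim import corpora
from gensim.models import TfidfModel, LsiModel

docs = [['human', 'computer', 'interaction'], ['graph', 'minors', 'survey'], ['human', 'graph']]
dictionary = corpora.Dictionary(docs)
bow = [dictionary.doc2bow(d) for d in docs]
tfidf = TfidfModel(bow)
lsi = LsiModel(corpus=tfidf[bow], id2word=dictionary, num_topics=2)
print(lsi[tfidf[bow[0]]])  # LSI projection of the first document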
def main():
    parser = ArgumentParser(
        description='Wrapper script for churning datasets of the wiki or elasticsearch kind through gensim '
                    'to produce topic models. Please see the gensim documentation for more information.')
    parser.add_argument('-ds', '--dataset', default='wiki', help='What kind of dataset to use. (wiki,es,file)')
    parser.add_argument('-d', '--dump-file', help='Wiki: bz2 dump file with wiki in it')
    parser.add_argument('-l', '--limit', help='Wiki: How many documents to extract from wiki')
    parser.add_argument('--model-id', default='model', help='Filename for created model.')
    parser.add_argument('--model-type', default='lsi', help='Model type (lsi, lda, word2vec, hdp, vocabulary).')
    parser.add_argument('--n-topics', default=10, help='Number of topics to model.')
    parser.add_argument('--n-passes', default=1, help='Number of passes for LDA model.')
    parser.add_argument('--w2v-size', default=100, help='Size of Word2Vec context.')
    parser.add_argument('--w2v-window', default=5, help='Window for Word2Vec.')
    parser.add_argument('-q', '--query', default=None, help='Elasticsearch: Query to use to fetch documents')
    parser.add_argument('--index', help='Elasticsearch: index to read from.')
    parser.add_argument('--doc_type', default='doc', help='Elasticsearch: data type in index.')
    parser.add_argument('--data-dir', help='Directory to save the generated models and vocabularies into.')
    parser.add_argument('--vocab', help='Prebuilt Vocabulary file. Use this to avoid having to generate one.')
    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ['lsi', 'lda', 'word2vec', 'hdp', 'vocabulary']:
        logging.error("Invalid model type %s" % model_type)
        parser.print_usage()
        exit(-1)
    logging.info("Using model type %s" % model_type)

    dump_fn = opts.dump_file
    data_type = opts.dataset.lower()
    if data_type not in ['es', 'wiki', 'file']:
        logging.error("Invalid dataset type %s" % data_type)
        parser.print_usage()
        exit(-1)

    limit = None
    if opts.limit:
        limit = int(opts.limit)

    if not dump_fn and data_type in ['wiki']:
        logging.error('--dump-file required for wiki dataset')
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == 'es' and index is None:
        logging.error("Please specify the elasticsearch index to fetch from using the --index parameter")
        sys.exit(1)

    n_topics = int(opts.n_topics)
    n_passes = int(opts.n_passes)
    logging.info("Using %d topics." % n_topics)

    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = '%s_%s_%d' % (model_id, model_type, n_topics)
    if data_dir:
        model_fn = '%s/%s' % (data_dir, model_fn)
    if model_type == 'word2vec':
        w2v_size = int(opts.w2v_size)
        w2v_window = int(opts.w2v_window)
        model_fn = '%s_w_%s_s_%s' % (model_fn, w2v_window, w2v_size)
    logging.info("Writing models to %s." % model_fn)

    if data_type == 'es':
        logging.info("Using data type %s with index %s, doc_type %s query %s" % (data_type, index, doc_type, query))
        dataset = ElasticsearchDataset(read_index=index, read_doc_type=doc_type, query=query, normalize_func=normalize_es)
    elif data_type == 'wiki':
        logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit))
        dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_wiki)
    elif data_type == 'file':
        logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit))
        dataset = FileDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_file)

    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words('norwegian'))
    if not vocab_file or model_type == 'vocabulary':
        vocab.add_documents([get_tokenized(page, sw) for page in dataset])
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + '.vocab')
    else:
        vocab = Dictionary.load(vocab_file)

    if model_type == 'vocabulary':
        return

    tfidf = TfidfModel(dictionary=vocab)

    if model_type == 'lsi':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus], num_topics=n_topics, id2word=vocab)
    elif model_type == 'lda':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus], num_topics=n_topics, passes=n_passes, id2word=vocab)
    elif model_type == 'word2vec':
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == 'hdp':
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)

    logging.info(model)
    model.save(model_fn)
def run():
    try:
        print("starting to build LSI Model")
        start = datetime.now()

        documents = Feature.objects.exclude(text=None).values_list("text", flat=True)
        number_of_documents = len(documents)
        print("number_of_documents:", number_of_documents)

        texts = [tokenize(document) for document in documents]

        # drop tokens that appear only once across the corpus
        counter = Counter()
        for text in texts:
            counter.update(text)
        texts = [[token for token in text if counter[token] > 1] for text in texts]
        print("texts:", len(texts), texts[:5])

        dictionary = Dictionary(texts)
        # print("dictionary:", dictionary)
        dictionary.save(path_to_directory_of_this_file + "/dictionary")

        corpus = [dictionary.doc2bow(text) for text in texts]
        print("corpus:", type(corpus))

        print("generating lsi model")
        lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=10)
        print("saving LSI model")
        lsi.save(path_to_directory_of_this_file + "/model")

        # nullify all topics on features and places
        Feature.objects.exclude(topic=None).update(topic=None)
        Place.objects.exclude(topic=None).update(topic=None)
        Topic.objects.all().delete()
        print("deleted all topics")

        topics = []
        for topic in lsi.show_topics():
            topics.append(Topic(id=topic[0], name=prettify_topic(topic[1])))
        Topic.objects.bulk_create(topics)
        print("bulk created all topics")

        """
        # re-create topics for all features in database
        for feature in Feature.objects.exclude(text=None).exclude(text=""):
            words = tokenize(feature.text)
            if words:
                probabilities = lsi[dictionary.doc2bow(words)]
                if probabilities:
                    topic_id = sorted(probabilities, key=lambda tup: -1 * tup[1])[0][0]
                    if topic_id:
                        feature.topic_id = topic_id
                        feature.save()

        # assign a topic to each place based on the most popular topic found in its features
        for place_id in Place.objects.exclude(featureplace=None).values_list("id", flat=True):
            counter = Counter(Feature.objects.filter(featureplace__place_id=place_id).values_list("topic_id"))
            print("counter:", counter)
        """
    except Exception as e:
        print(e)
# corpus_test_word_seg_tfidf = model.__getitem__(corpus_test_word_seg)
# corpora.MmCorpus.serialize('../topic_model/corpus_test_word_seg_tfidf', corpus_test_word_seg_tfidf)

corpus_train_word_seg_tfidf = corpora.MmCorpus('../topic_model/corpus_train_word_seg_tfidf')
corpus_dev_word_seg_tfidf = corpora.MmCorpus('../topic_model/corpus_dev_word_seg_tfidf')
corpus_test_word_seg_tfidf = corpora.MmCorpus('../topic_model/corpus_test_word_seg_tfidf')

corpus_word_seg_tfidf = []
corpus_word_seg_tfidf.extend(corpus_train_word_seg_tfidf)
corpus_word_seg_tfidf.extend(corpus_dev_word_seg_tfidf)
corpus_word_seg_tfidf.extend(corpus_test_word_seg_tfidf)

# LSI
print('Start training lsi...')
lsi_model = LsiModel(corpus=corpus_word_seg_tfidf, id2word=dictionary_word_seg, num_topics=400)
lsi_model.save('../topic_model/word_seg_lsi_model')

corpus_train_word_seg_lsi = lsi_model[corpus_train_word_seg_tfidf]
corpus_dev_word_seg_lsi = lsi_model[corpus_dev_word_seg_tfidf]
corpus_test_word_seg_lsi = lsi_model[corpus_test_word_seg_tfidf]
corpora.MmCorpus.serialize('../topic_model/corpus_train_word_seg_lsi', corpus_train_word_seg_lsi)
corpora.MmCorpus.serialize('../topic_model/corpus_dev_word_seg_lsi', corpus_dev_word_seg_lsi)
corpora.MmCorpus.serialize('../topic_model/corpus_test_word_seg_lsi', corpus_test_word_seg_lsi)

# LDA
print('Start training lda...')
lda_model = LdaModel(corpus=corpus_word_seg_tfidf, id2word=dictionary_word_seg, num_topics=100,
                     update_every=1, chunksize=1000, passes=1)
lda_model.save('../topic_model/word_seg_lda_model')

corpus_train_word_seg_lda = lda_model[corpus_train_word_seg_tfidf]
corpus_dev_word_seg_lda = lda_model[corpus_dev_word_seg_tfidf]
corpus_test_word_seg_lda = lda_model[corpus_test_word_seg_tfidf]
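# Hedged sketch (not in the original script): downstream classifiers typically want dense
# arrays, so the serialised LSI corpora above can be reloaded and densified with gensim's
# matutils; num_terms=400 matches the num_topics used for lsi_model.
from gensim import corpora, matutils

corpus_train_word_seg_lsi = corpora.MmCorpus('../topic_model/corpus_train_word_seg_lsi')
X_train_lsi = matutils.corpus2dense(corpus_train_word_seg_lsi, num_terms=400).T  # shape: (n_docs, 400)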
pp.pprint(sortedqueryresult)
# pp.pprint(sortedqueryresult[:10])

rootdir = os.getcwd()
foldername = 'lsi_output'
folderpath = os.path.join(rootdir, foldername)
if os.path.exists(folderpath):
    shutil.rmtree(folderpath)
os.makedirs(folderpath)
os.chdir(folderpath)

lsimodelfile = str(args.corpus).replace('.mm', '') + '_lsi.model'
lsi.save(lsimodelfile)

filename1 = str(args.corpus).replace('.mm', '') + '_lsi_topics.txt'
filename2 = str(args.corpus).replace('.mm', '') + '_item_{0}_classification.txt'.format(args.query)

with open(filename1, 'w') as f:
    f.write(str(corpustopics))
with open(filename2, 'w') as f:
    f.write(str(queryresult))

os.chdir(rootdir)

end_time = time.time()
runtime = end_time - start_time
class Feature(catscorpus.CatsCorpus, utils.Config):
    """ """

    def __init__(self, root_path, is_tfidf=False):
        catscorpus.CatsCorpus.__init__(self, root_path=root_path)
        # Select training corpus
        self.is_tfidf = is_tfidf
        if self.is_tfidf:
            self.training_corpus = self.tfidf   # Take the tf-idf matrix as input
        else:
            self.training_corpus = self.corpus  # Take the bow corpus as input

    def encoder_lda(self, num_topics=100, chunksize=500):
        """ """
        self.num_topics = num_topics
        # Train LDA on the training dataset
        self.lda = LdaModel(corpus=self.training_corpus, id2word=self.dictionary,
                            num_topics=num_topics, update_every=1, chunksize=chunksize, passes=1)
        # Convert bow into topic vectors
        self.corpus_lda = self.lda[self.training_corpus]

    def encoder_lsi(self, num_components=100, chunksize=500, is_tfidf=False):
        """ """
        self.num_components = num_components
        # Train LSI on the training dataset (initialize an LSI transformation)
        self.lsi = LsiModel(corpus=self.training_corpus, id2word=self.dictionary,
                            num_topics=num_components, chunksize=chunksize)
        # Convert bow into LSI projections
        self.corpus_lsi = self.lsi[self.training_corpus]

    def encoder_gbrbm(self, n_hidden=1000, lr=0.01, n_epoches=10, batch_size=100):
        """ """
        n_visible = len(self.dictionary)
        training_dataset = corpus2dense(self.training_corpus, num_terms=n_visible).transpose()
        self.rbm = GBRBM(n_visible, n_hidden=n_hidden, learning_rate=lr, momentum=0.95,
                         err_function='mse', use_tqdm=False, sample_visible=False, sigma=1)
        self.rbm.fit(training_dataset, n_epoches=n_epoches, batch_size=batch_size,
                     shuffle=True, verbose=True)
        self.corpus_rbm = self.rbm.transform(training_dataset)

    def save_gbrbm(self, model_path=None, output_path=None):
        """ """
        model_path = "%s/%s" % (model_path, "model")
        output_path = "%s/%s" % (output_path, "npy.mat.txt")
        # if model_path:
        #     self.rbm.save(model_path)
        if output_path:
            np.savetxt(output_path, self.corpus_rbm, delimiter=',')

    def save_lda(self, model_path=None, output_path=None):
        """ """
        model_path = "%s/%s" % (model_path, "model")
        output_path = "%s/%s" % (output_path, "npy.mat.txt")
        if model_path:
            self.lda.save(model_path)
        if output_path:
            numpy_matrix = corpus2dense(self.corpus_lda, num_terms=self.num_topics).transpose()
            np.savetxt(output_path, numpy_matrix, delimiter=',')

    def save_lsi(self, model_path=None, output_path=None):
        """ """
        model_path = "%s/%s" % (model_path, "model")
        output_path = "%s/%s" % (output_path, "npy.mat.txt")
        if model_path:
            self.lsi.save(model_path)
        if output_path:
            numpy_matrix = corpus2dense(self.corpus_lsi, num_terms=self.num_components).transpose()
            np.savetxt(output_path, numpy_matrix, delimiter=',')

    def random_sampling(self, num_samples):
        catscorpus.CatsCorpus.random_sampling(self, num_samples)
        # Re-select training corpus
        if self.is_tfidf:
            self.training_corpus = self.tfidf   # Take the tf-idf matrix as input
        else:
            self.training_corpus = self.corpus  # Take the bow corpus as input

    def category_sampling(self, categories):
        catscorpus.CatsCorpus.category_sampling(self, categories)
        # Re-select training corpus
        if self.is_tfidf:
            self.training_corpus = self.tfidf   # Take the tf-idf matrix as input
        else:
            self.training_corpus = self.corpus  # Take the bow corpus as input

    def __iter__(self):
        pass
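# Hedged usage sketch for the Feature encoder above; the root path and the call sequence
# are illustrative only, and catscorpus/GBRBM come from the snippet's own project.
# feat = Feature('/path/to/corpus_root', is_tfidf=True)
# feat.encoder_lsi(num_components=100)   # fit LSI on the selected training corpus
# feat.save_lsi(model_path='/tmp/lsi', output_path='/tmp/lsi')  # save model + dense matrix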