def dictionaryGen(data_fp, dict_fp):
    """Build a gensim dictionary from the data at data_fp and save it to dict_fp.

    Steps, in order: load the data as JSON and convert it to tuples,
    tokenize it, create the dictionary, apply the frequency filter with
    n=1, and save the result.

    data_fp -- path handed to GensimCorpus as the data source
    dict_fp -- path the dictionary is saved to
    """
    builder = GensimCorpus(data_fp)

    # Read the raw JSON, then turn it into tuples. The second call is made on
    # whatever loadjson() returns, which keeps the original call chain intact.
    loaded = builder.loadjson()
    loaded.json2tuple()

    builder.tokenizeData()

    # Parenthesised single-argument print: same output under the Python 2
    # print statement, and also valid as a Python 3 call.
    print('creating dictionary...')
    # NOTE(review): presumably filterFrequency(n=1) drops low-frequency words;
    # the exact threshold semantics live in GensimCorpus -- confirm there.
    created = builder.createDictionary()
    created.filterFrequency(n=1)

    print('saving dictionary to %s' % dict_fp)
    builder.saveDictionary(dict_fp)
def corpusGen(data_fp, dict_fp, corpus_fp):
    '''
    generates a gensim corpus using a gensim dictionary file

    data_fp   -- path handed to GensimCorpus as the data source
    dict_fp   -- path of a previously saved dictionary to load
    corpus_fp -- path the generated corpus is saved to
    '''
    # load data as json and convert to tuple
    g = GensimCorpus(data_fp)
    g.loadjson().json2tuple()

    # tokenize data
    g.tokenizeData()

    # load dictionary
    g.loadDictionary(dict_fp)

    # create corpus from the text half of each (tag, text) pair; tags are unused
    print('creating corpus...')
    g.createCorpus([text for _tag, text in g.data])

    # save corpus; report the corpus path (the message used to show dict_fp)
    print('saving corpus to %s' % corpus_fp)
    g.saveCorpus(corpus_fp)
def ldaGen(dict_fp, corpus_fp, model_fp,
           streamParameters=None, batchParameters=None):
    '''
    trains an LDA model from a saved dictionary and corpus and saves it

    dict_fp          -- path of the saved dictionary to load
    corpus_fp        -- path of the saved corpus to load
    model_fp         -- path the trained model is saved to
    streamParameters -- keyword arguments for LdaModel.streamParams; takes
                        precedence when both parameter sets are given
    batchParameters  -- keyword arguments for LdaModel.batchParams; used only
                        when streamParameters is empty or None

    returns the LdaModel instance after it has been run and saved
    raises ValueError if neither parameter set is supplied (an empty dict
    counts as not supplied)
    '''
    # Fail fast: without either parameter set there is nothing to configure
    # the model with, so reject the call before loading anything from disk.
    if not streamParameters and not batchParameters:
        raise ValueError('please specify streaming or batch lda')

    g = GensimCorpus()
    # NOTE(review): assumes loadDictionary/loadCorpus return the loaded
    # objects rather than the GensimCorpus instance -- confirm in GensimCorpus.
    dictionary = g.loadDictionary(dict_fp)
    corpus = g.loadCorpus(corpus_fp)
    lda = LdaModel(corpus, dictionary)

    # get params
    if streamParameters:
        params = lda.streamParams(**streamParameters)
    else:
        params = lda.batchParams(**batchParameters)

    # set params and run model
    lda.setParams(params)
    lda.runLda()
    lda.save_model(model_fp)
    return lda