예제 #1
0
파일: main.py 프로젝트: royshan/gensimLite
def dictionaryGen(data_fp, dict_fp):
    """
    Build a gensim dictionary from the data at ``data_fp``
    and write it out to ``dict_fp``.
    """
    corpus_builder = GensimCorpus(data_fp)

    # read the data object as json, then convert it to tuples
    loaded = corpus_builder.loadjson()
    loaded.json2tuple()

    # split the loaded data into tokens
    corpus_builder.tokenizeData()

    # build the dictionary, then apply the frequency filter with n=1
    print('creating dictionary...')
    built = corpus_builder.createDictionary()
    built.filterFrequency(n=1)

    # persist the dictionary
    print('saving dictionary to %s' % dict_fp)
    corpus_builder.saveDictionary(dict_fp)
예제 #2
0
def dictionaryGen(data_fp, dict_fp):
    """
    Create a gensim dictionary from the data file ``data_fp``
    and store it at ``dict_fp``.
    """
    builder = GensimCorpus(data_fp)

    # load the data object as json and turn it into tuples
    json_stage = builder.loadjson()
    json_stage.json2tuple()

    # tokenize what was loaded
    builder.tokenizeData()

    # create the dictionary and run the frequency filter (n=1)
    print('creating dictionary...')
    dict_stage = builder.createDictionary()
    dict_stage.filterFrequency(n=1)

    # write the dictionary to disk
    print('saving dictionary to %s' % dict_fp)
    builder.saveDictionary(dict_fp)
예제 #3
0
def ldaGen(dict_fp, corpus_fp, model_fp,
           streamParameters=None, batchParameters=None):
    """
    Run LDA over a saved dictionary and corpus, then save the model.

    Args:
        dict_fp: path handed to ``GensimCorpus.loadDictionary``.
        corpus_fp: path handed to ``GensimCorpus.loadCorpus``.
        model_fp: path handed to ``LdaModel.save_model``.
        streamParameters: keyword arguments for ``LdaModel.streamParams``.
            Takes precedence when both parameter sets are supplied.
        batchParameters: keyword arguments for ``LdaModel.batchParams``.
            Used only when ``streamParameters`` is empty or None.

    Returns:
        The ``LdaModel`` instance after ``runLda`` and ``save_model``.

    Raises:
        ValueError: if neither ``streamParameters`` nor
            ``batchParameters`` is supplied (or both are empty).
    """
    # Validate before loading anything: previously this case only printed
    # a message and then failed on an unbound ``params`` after the
    # dictionary and corpus had already been loaded.
    if not streamParameters and not batchParameters:
        raise ValueError('please specify streaming or batch lda')

    g = GensimCorpus()
    dictionary = g.loadDictionary(dict_fp)
    corpus = g.loadCorpus(corpus_fp)
    lda = LdaModel(corpus, dictionary)

    # get params (stream settings win when both are given)
    if streamParameters:
        params = lda.streamParams(**streamParameters)
    else:
        params = lda.batchParams(**batchParameters)

    # set params and run model
    lda.setParams(params)
    lda.runLda()
    lda.save_model(model_fp)
    return lda
예제 #4
0
파일: main.py 프로젝트: royshan/gensimLite
def ldaGen(dict_fp,
           corpus_fp,
           model_fp,
           streamParameters=None,
           batchParameters=None):
    """
    Run LDA over a saved dictionary and corpus, then save the model.

    Args:
        dict_fp: path handed to ``GensimCorpus.loadDictionary``.
        corpus_fp: path handed to ``GensimCorpus.loadCorpus``.
        model_fp: path handed to ``LdaModel.save_model``.
        streamParameters: keyword arguments for ``LdaModel.streamParams``.
            Takes precedence when both parameter sets are supplied.
        batchParameters: keyword arguments for ``LdaModel.batchParams``.
            Used only when ``streamParameters`` is empty or None.

    Returns:
        The ``LdaModel`` instance after ``runLda`` and ``save_model``.

    Raises:
        ValueError: if neither ``streamParameters`` nor
            ``batchParameters`` is supplied (or both are empty).
    """
    # Validate before loading anything: previously this case only printed
    # a message and then failed on an unbound ``params`` after the
    # dictionary and corpus had already been loaded.
    if not streamParameters and not batchParameters:
        raise ValueError('please specify streaming or batch lda')

    g = GensimCorpus()
    dictionary = g.loadDictionary(dict_fp)
    corpus = g.loadCorpus(corpus_fp)
    lda = LdaModel(corpus, dictionary)

    # get params (stream settings win when both are given)
    if streamParameters:
        params = lda.streamParams(**streamParameters)
    else:
        params = lda.batchParams(**batchParameters)

    # set params and run model
    lda.setParams(params)
    lda.runLda()
    lda.save_model(model_fp)
    return lda
예제 #5
0
파일: main.py 프로젝트: royshan/gensimLite
def corpusGen(data_fp, dict_fp, corpus_fp):
    """
    Generate a gensim corpus using a gensim dictionary file.

    Args:
        data_fp: path of the data file given to ``GensimCorpus``.
        dict_fp: path of the dictionary file to load.
        corpus_fp: path the corpus is saved to.
    """
    # load data
    g = GensimCorpus(data_fp)
    g.loadjson().json2tuple()
    # tokenize data
    g.tokenizeData()
    # load dictionary
    g.loadDictionary(dict_fp)
    # create corpus from the text half of each (tag, text) pair
    print('creating corpus...')
    g.createCorpus([text for _tag, text in g.data])
    # save corpus; report the corpus path (the message used to show dict_fp)
    print('saving corpus to %s' % corpus_fp)
    g.saveCorpus(corpus_fp)
예제 #6
0
def corpusGen(data_fp, dict_fp, corpus_fp):
    """
    Generate a gensim corpus using a gensim dictionary file.

    Args:
        data_fp: path of the data file given to ``GensimCorpus``.
        dict_fp: path of the dictionary file to load.
        corpus_fp: path the corpus is saved to.
    """
    # load data
    g = GensimCorpus(data_fp)
    g.loadjson().json2tuple()
    # tokenize data
    g.tokenizeData()
    # load dictionary
    g.loadDictionary(dict_fp)
    # create corpus from the text half of each (tag, text) pair
    print('creating corpus...')
    g.createCorpus([text for _tag, text in g.data])
    # save corpus; report the corpus path (the message used to show dict_fp)
    print('saving corpus to %s' % corpus_fp)
    g.saveCorpus(corpus_fp)