示例#1
0
文件: NIPS.py 项目: csa0001/Refinery
def get_minibatch_iterator(nBatch=10, nLap=1, dataorderseed=0, **kwargs):
    ''' Build a minibatch iterator over the dataset read from matfilepath.

    Args
    -------
    nBatch : forwarded to AdmixMinibatchIterator as nBatch
    nLap : forwarded to AdmixMinibatchIterator as nLap
    dataorderseed : forwarded to AdmixMinibatchIterator as dataorderseed
    **kwargs : accepted for call compatibility; not used in this function

    Returns
    -------
    The AdmixMinibatchIterator, with its .summary attribute set to the
    result of get_data_info(nDocTotal, vocab_size) for the loaded data.
    '''
    corpus = WordsData.read_from_mat(matfilepath)
    iterator_options = dict(nBatch=nBatch, nLap=nLap,
                            dataorderseed=dataorderseed)
    batch_iterator = AdmixMinibatchIterator(corpus, **iterator_options)
    batch_iterator.summary = get_data_info(corpus.nDocTotal,
                                           corpus.vocab_size)
    return batch_iterator
示例#2
0
def get_data(seed=8675309, nObsTotal=25000, **kwargs):
    ''' Load a fixed subset of documents from the database.

    Intended for initialization (per the original note, used only once).
    Selects rows whose rowid is in range(1, 500), i.e. rowids 1..499.
    NOTE(review): the original comment said "first 500 documents", but
    range(1, 500) yields 499 ids -- confirm which was intended.

    seed, nObsTotal and **kwargs are accepted but not used in this function.

    Returns
    -------
    The object returned by WordsData.read_from_db, with .summary set from
    get_data_info(nDocTotal, vocab_size).
    '''
    selected_ids = range(1, 500)
    id_list = ','.join(str(doc_id) for doc_id in selected_ids)
    query = 'select * from data where rowid in (' + id_list + ')'
    n_selected = len(selected_ids)
    corpus = WordsData.read_from_db(dbpath, query, nDoc=n_selected,
                                    nDocTotal=n_selected, vocab_size=V)
    corpus.summary = get_data_info(corpus.nDocTotal, corpus.vocab_size)
    return corpus
def get_test_data(seed=6789, nDocTotal=100, **kwargs):
    ''' Create a toy dataset of "heldout" docs, for testing purposes.

    Options missing from kwargs are first filled in by
    updateKwArgsWithDefaults; 'seed' and 'nDocTotal' are then overwritten
    with this function's own arguments (defaults 6789 and 100), so they
    always take precedence over any value supplied through kwargs.

    Returns
    -------
    The object returned by WordsData.CreateToyDataFromMixModel, with .name
    and .summary set from get_short_name() and get_data_info().
    '''
    updateKwArgsWithDefaults(kwargs)
    # Explicit arguments win over both caller kwargs and module defaults.
    kwargs.update(seed=seed, nDocTotal=nDocTotal)
    heldout = WordsData.CreateToyDataFromMixModel(**kwargs)
    heldout.name = get_short_name()
    heldout.summary = get_data_info()
    return heldout
示例#4
0
def get_data(**kwargs):
    ''' Create and return a toy dataset.

        Keyword Args
        -------
        nDocTotal
        nWordsPerDoc
        Options not supplied are filled in by updateKwArgsWithDefaults
        before everything is forwarded to
        WordsData.CreateToyDataFromMixModel.

        Returns
        -------
        The generated dataset, with .name and .summary attached.
    '''
    updateKwArgsWithDefaults(kwargs)
    toy_data = WordsData.CreateToyDataFromMixModel(**kwargs)
    toy_data.name = get_short_name()
    toy_data.summary = get_data_info()
    return toy_data
示例#5
0
def get_data(seed=8675309, nObsTotal=25000, **kwargs):
    ''' Load a fixed subset of documents from the database.

    Intended for initialization (per the original note, used only once).
    The query selects rows whose rowid is in range(1, 500), i.e. 1..499.
    NOTE(review): the original comment said "first 500 documents", but
    range(1, 500) yields 499 ids -- confirm which was intended.

    seed, nObsTotal and **kwargs are accepted but not used in this function.

    Returns
    -------
    The object returned by WordsData.read_from_db, with .summary set from
    get_data_info(nDocTotal, vocab_size).
    '''
    rowids = range(1, 500)
    rowid_csv = ','.join(str(rowid) for rowid in rowids)
    query = 'select * from data where rowid in (' + rowid_csv + ')'
    n_rows = len(rowids)
    corpus = WordsData.read_from_db(
        dbpath, query, nDoc=n_rows, nDocTotal=n_rows, vocab_size=V)
    corpus.summary = get_data_info(corpus.nDocTotal, corpus.vocab_size)
    return corpus
示例#6
0
def get_minibatch_iterator(seed=8675309,
                           nBatch=10,
                           nObsBatch=None,
                           nObsTotal=25000,
                           nLap=1,
                           allocModelName=None,
                           dataorderseed=0,
                           **kwargs):
    ''' Build a minibatch iterator over the dataset read from matfilepath.

    nBatch, nObsBatch, nLap and dataorderseed are forwarded to
    AdmixMinibatchIterator under the same names. seed, nObsTotal,
    allocModelName and **kwargs are accepted but not used in this function.

    Returns
    -------
    The AdmixMinibatchIterator, with its .summary attribute set to the
    result of get_data_info(nDocTotal, vocab_size) for the loaded data.
    '''
    corpus = WordsData.read_from_mat(matfilepath)
    batch_iterator = AdmixMinibatchIterator(
        corpus,
        nBatch=nBatch,
        nObsBatch=nObsBatch,
        nLap=nLap,
        dataorderseed=dataorderseed)
    batch_iterator.summary = get_data_info(corpus.nDocTotal,
                                           corpus.vocab_size)
    return batch_iterator
示例#7
0
def get_minibatch_iterator(seed=8675309,
                           nBatch=10,
                           nObsBatch=None,
                           nObsTotal=25000,
                           nLap=1,
                           allocModelName=None,
                           dataorderseed=0,
                           **kwargs):
    ''' Build a minibatch iterator over data constructed from get_BoW(seed).

    The dict returned by get_BoW(seed) is expanded into the WordsData
    constructor. nBatch, nObsBatch, nLap and dataorderseed are forwarded to
    AdmixMinibatchIterator under the same names. nObsTotal, allocModelName
    and **kwargs are accepted but not used in this function.

    Returns
    -------
    The AdmixMinibatchIterator, with .summary set from get_data_info().
    '''
    corpus = WordsData(**get_BoW(seed))
    batch_iterator = AdmixMinibatchIterator(
        corpus,
        nBatch=nBatch,
        nObsBatch=nObsBatch,
        nLap=nLap,
        dataorderseed=dataorderseed)
    batch_iterator.summary = get_data_info()
    return batch_iterator
示例#8
0
文件: NIPS.py 项目: csa0001/Refinery
def get_data(**kwargs):
    ''' Load the dataset from the matfile specified by matfilepath.

    **kwargs is accepted for call compatibility but not used here.

    Returns
    -------
    The object returned by WordsData.read_from_mat, with .summary set from
    get_data_info(nDocTotal, vocab_size).
    '''
    corpus = WordsData.read_from_mat(matfilepath)
    n_docs, n_vocab = corpus.nDocTotal, corpus.vocab_size
    corpus.summary = get_data_info(n_docs, n_vocab)
    return corpus
示例#9
0
def CreateToyDataFromLDAModel(**kwargs):
    ''' Create toy data via WordsData.CreateToyDataFromLDAModel.

    Any key of the module-level Defaults that the caller did not supply is
    added with its default value; caller-supplied values are kept as is.
    '''
    for option in Defaults:
        kwargs.setdefault(option, Defaults[option])
    return WordsData.CreateToyDataFromLDAModel(**kwargs)
示例#10
0
def _read_vocab(vocabfile):
    ''' Map each zero-based line number of vocabfile to that line's stripped text.
    '''
    with open(vocabfile, 'r') as f:
        return dict(enumerate(line.strip() for line in f))


def _read_word_counts(datafile):
    ''' Parse "docID,wordID,count" lines into parallel lists and doc ranges.

    Blank lines are skipped. Rows are grouped into documents by runs of
    consecutive lines sharing the same docID.

    Returns
    -------
    word_id : list of int, column 1 of every row
    word_count : list of int, column 2 of every row
    docrange : list of [start, stop] row-index pairs, one per document.
        Ranges are half-open and contiguous, so the last stop equals the
        total number of rows. An empty file yields the single range [0, 0].
    '''
    word_id = []
    word_count = []
    docrange = []
    start = 0
    # None (not -1) marks "no document seen yet", so any integer doc ID works.
    current_doc = None
    with open(datafile, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            fields = line.split(",")
            row = len(word_id)  # index this row will occupy
            doc_id = int(fields[0])
            if current_doc is None:
                current_doc = doc_id
            elif doc_id != current_doc:
                # This row opens a new document: close the previous one at
                # `row` (exclusive) and start the next one at the same index,
                # so neighbouring ranges neither overlap nor drop a row.
                docrange.append([start, row])
                start = row
                current_doc = doc_id
            word_id.append(int(fields[1]))
            word_count.append(int(fields[2]))
    docrange.append([start, len(word_id)])
    return word_id, word_count, docrange


def run_topic_modeling(username,folder_id,ex_id):
    ''' Run topic modeling for one folder/experiment and store the result.

    Steps, in order:
      1. Look up the Folder and Experiment, mark the status 'inprogress'
         via set_tm_status and commit.
      2. Build a WordsData object from the folder's vocab and wordcount
         files.
      3. Run bnpy ('HDPModel', 'Mult', 'VB', 100 laps) with K taken from
         the experiment info's nTopics.
      4. Store getModelState(...) on exinfo.viz_data, mark the status
         'finish' and commit.

    Args
    -------
    username : passed to set_tm_status and to the bnpy custom-function args
    folder_id : primary key used to fetch the Folder
    ex_id : primary key used to fetch the Experiment

    Returns
    -------
    None
    '''
    d = Folder.query.get(folder_id)
    ex = Experiment.query.get(ex_id)
    set_tm_status(username, folder_id, ex, 'inprogress')
    db.session.commit()

    exinfo = ex.getExInfo()

    # CREATE WORD DATA
    datafile = d.wordcount_path()
    vocabfile = d.vocab_path()
    vocab = _read_vocab(vocabfile)
    word_id, word_count, docrange = _read_word_counts(datafile)
    data = WordsData(word_id, word_count, docrange,
                     len(vocab), vocab, len(docrange))

    # RUN Topic Modeling in BNPY
    callback_args = {"tm_id": str(d.id), "username": username}
    hmodel = bnpy.Run.run(data, 'HDPModel', 'Mult', 'VB', doSaveToDisk=False,
                          K=exinfo.nTopics, nLap=100,
                          initname="randomfromprior",
                          customFuncPath="refinery/webapp/",
                          customFuncArgs=json.dumps(callback_args))
    # Disabled extra options previously kept next to this call:
    #   moves='birth,merge', birthPerLap=10, mergePerLap=10, nFreshLap=25

    exinfo.viz_data = getModelState(hmodel[0], hmodel[1], 100)

    set_tm_status(username, folder_id, ex, 'finish')
    db.session.commit()
示例#11
0
def genWordsData(**kwargs):
    ''' Generate toy data via WordsData.genToyData.

    Any key of the module-level Defaults that the caller did not supply is
    added with its default value; caller-supplied values are kept as is.
    '''
    for option in Defaults:
        kwargs.setdefault(option, Defaults[option])
    return WordsData.genToyData(**kwargs)
示例#12
0
def get_data_info():
    ''' Return a one-line description of the toy dataset.

    The module-level K is formatted into the text as the topic count.
    '''
    return 'Toy Bars Data with %d true topics. Each doc uses ONE topic.' % K


def get_data(**kwargs):
    ''' Create and return a toy dataset.

        Keyword Args
        -------
        nDocTotal
        nWordsPerDoc
        Options not supplied are filled in from Defaults (through
        updateKwArgsWithDefaults) before everything is forwarded to
        WordsData.CreateToyDataFromMixModel.

        Returns
        -------
        The generated dataset, with .name and .summary attached.
    '''
    updateKwArgsWithDefaults(kwargs)
    toy_data = WordsData.CreateToyDataFromMixModel(**kwargs)
    toy_data.name = get_short_name()
    toy_data.summary = get_data_info()
    return toy_data


def updateKwArgsWithDefaults(kwargs):
    ''' Fill kwargs, in place, with every entry of Defaults it lacks.

    Keys already present in kwargs keep their values. Returns None.
    '''
    for option in Defaults:
        kwargs.setdefault(option, Defaults[option])


if __name__ == '__main__':
    # Script entry point: generate a toy dataset using the module-level
    # Defaults and plot example documents from it.
    # The visualization module is imported here, so it is only needed when
    # this file is run as a script.
    import bnpy.viz.BarsViz
    WData = WordsData.CreateToyDataFromMixModel(**Defaults)
    bnpy.viz.BarsViz.plotExampleBarsDocs(WData)
示例#13
0
def get_data(seed=8675309, nObsTotal=25000, **kwargs):
    ''' Build the dataset from the bag-of-words dict returned by get_BoW.

    seed is passed to get_BoW; the dict it returns is expanded into the
    WordsData constructor. nObsTotal and **kwargs are accepted but not used
    in this function.

    Returns
    -------
    The WordsData object, with .summary set from get_data_info().
    '''
    corpus = WordsData(**get_BoW(seed))
    corpus.summary = get_data_info()
    return corpus