Exemplo n.º 1
0
 def UpdateModel(self):
     """Optionally retrain the tree-TM model, then publish its state into self.content.

     Retraining happens only when the requested action is 'train' AND an
     iteration count was supplied; in every case the same snapshot of
     constraints and iteration count is written to self.content.
     """
     app_path = self.request.folder
     # Tree-TM model files live under the app's data folder.
     app_model_path = '{}/data/treetm'.format(app_path)
     iterCount = self.GetIterCount(app_model_path)
     iters = self.GetIters(iterCount)
     mustLinks, cannotLinks, keepTerms, removeTerms = self.GetConstraints()
     action = self.GetAction()
     if action == 'train' and iters is not None:
         # Refine the existing model with the user-supplied constraints...
         RefineLDA(app_model_path,
                   numIters=iters,
                   mustLinks=mustLinks,
                   cannotLinks=cannotLinks,
                   keepTerms=keepTerms,
                   removeTerms=removeTerms)
         # ...then rebuild the LDA database from the refined model output.
         with LDA_DB(isReset=True) as lda_db:
             reader = TreeTMReader(lda_db, app_model_path)
             reader.Execute()
             computer = LDA_ComputeStats(lda_db)
             computer.Execute()
     # The original duplicated this update in both branches; it is hoisted
     # here once since both paths report the identical snapshot.
     self.content.update({
         'IterCount': iterCount,
         'MustLinks': mustLinks,
         'CannotLinks': cannotLinks,
         'KeepTerms': keepTerms,
         'RemoveTerms': removeTerms
     })
Exemplo n.º 2
0
def index():
	"""Render the default group-in-box page."""
	# Open all three databases just long enough to build the handler.
	with Corpus_DB() as corpus_db, BOW_DB() as bow_db, LDA_DB() as lda_db:
		handler = GroupInBoxHandler(request, response, corpus_db, bow_db, lda_db)
	# Use [[ ... ]] delimiters so the template doesn't clash with client-side braces.
	response.delimiters = ('[[', ']]')
	return handler.GenerateResponse()
Exemplo n.º 3
0
def TermFrequencyModel():
	"""Return the term-frequency model serialized as a JSON response body."""
	with BOW_DB() as bow_db:
		with LDA_DB() as lda_db:
			handler = TermTopicMatrix1(request, response, bow_db, lda_db)
			data = handler.GetTermFrequencyModel()
	# json.dumps() lost its 'encoding' keyword in Python 3 (it was a no-op
	# 'utf-8' default in Python 2), so it is omitted here.
	dataStr = json.dumps(data, indent=2, sort_keys=True)
	response.headers['Content-Type'] = 'application/json'
	return dataStr
Exemplo n.º 4
0
def GroupInBox():
    """Serve the group-in-box visualization page."""
    # All three databases are needed only while constructing the handler.
    with Corpus_DB() as corpus_db, BOW_DB() as bow_db, LDA_DB() as lda_db:
        handler = GroupInBoxHandler(request, response, corpus_db, bow_db, lda_db)
    handler.LoadGIB()
    return handler.GenerateResponse()
Exemplo n.º 5
0
def Inspect():
    """Inspect the current model and render the resulting page."""
    # Databases are opened only for handler construction.
    with Corpus_DB() as corpus_db, BOW_DB() as bow_db, LDA_DB() as lda_db:
        handler = GroupInBoxHandler(request, response, corpus_db, bow_db, lda_db)
    handler.InspectModel()
    return handler.GenerateResponse()
Exemplo n.º 6
0
def ImportMalletLDA( app_name, model_path, corpus_path, database_path, is_quiet, force_overwrite ):
	"""Import an ITM (tree-TM) topic model as a web2py application.

	Copies the corpus text/database and the trained model into the app's
	folders, then computes derived statistics for the corpus, the LDA
	model, and the ITM model.

	Args:
		app_name: Name of the web2py app; created under 'apps/<app_name>'.
		model_path: Folder holding the trained tree-TM model files.
		corpus_path: Folder containing 'corpus.txt'.
		database_path: Folder containing 'corpus.db'.
		is_quiet: When True the log level is INFO, otherwise DEBUG.
		force_overwrite: Re-import even if the app folder already exists.
	"""
	logger = logging.getLogger( 'termite' )
	logger.addHandler( logging.StreamHandler() )
	logger.setLevel( logging.INFO if is_quiet else logging.DEBUG )
	
	app_path = 'apps/{}'.format( app_name )
	corpus_filename = '{}/corpus.txt'.format( corpus_path )
	database_filename = '{}/corpus.db'.format( database_path )
	logger.info( '--------------------------------------------------------------------------------' )
	logger.info( 'Import an ITM topic model as a web2py application...' )
	logger.info( '           app_name = %s', app_name )
	logger.info( '           app_path = %s', app_path )
	logger.info( '         model_path = %s', model_path )
	logger.info( '    corpus_filename = %s', corpus_filename )
	logger.info( '  database_filename = %s', database_filename )
	logger.info( '--------------------------------------------------------------------------------' )
	
	# Skip the import entirely when the app already exists, unless forced.
	if force_overwrite or not os.path.exists( app_path ):
		with CreateApp(app_name) as app:
			# Create a copy of the original corpus
			app_database_filename = '{}/corpus.db'.format( app.GetDataPath() )
			logger.info( 'Copying [%s] --> [%s]', database_filename, app_database_filename )
			shutil.copy( database_filename, app_database_filename )
			
			# Import corpus (models/corpus.db, data/corpus.txt, data/sentences.txt)
			app_corpus_filename = '{}/corpus.txt'.format( app.GetDataPath() )
			logger.info( 'Copying [%s] --> [%s]', corpus_filename, app_corpus_filename )
			shutil.copy( corpus_filename, app_corpus_filename )
			app_sentences_filename = '{}/sentences.txt'.format( app.GetDataPath() )
			logger.info( 'Extracting [%s] --> [%s]', corpus_filename, app_sentences_filename )
			SplitSentences( corpus_filename, app_sentences_filename )
			# corpus.db is copied twice: once into the data path (above) and
			# once into the database path (here).
			app_db_filename = '{}/corpus.db'.format( app.GetDatabasePath() )
			logger.info( 'Copying [%s] --> [%s]', database_filename, app_db_filename )
			shutil.copy( database_filename, app_db_filename )
			
			# Compute derived-statistics about the corpus
			db_path = app.GetDatabasePath()
			with Corpus_DB(db_path, isInit=True) as corpus_db:
				computer = Corpus_ComputeStats( corpus_db, app_corpus_filename, app_sentences_filename )
				computer.Execute()
			
				# Import model
				# NOTE(review): copytree fails if app_model_path already
				# exists — acceptable here because the app was just created.
				app_model_path = '{}/treetm'.format( app.GetDataPath() )
				logger.info( 'Copying [%s] --> [%s]', model_path, app_model_path )
				shutil.copytree( model_path, app_model_path )
			
				# Compute derived-statistics about the model
				with LDA_DB(db_path, isInit=True) as lda_db:
					reader = TreeTMReader( lda_db, app_model_path )
					reader.Execute()
					computer = LDA_ComputeStats( lda_db, corpus_db )
					computer.Execute()
					with ITM_DB(db_path, isInit=True) as itm_db:
						computer = ITM_ComputeStats( itm_db, corpus_db )
						computer.Execute()
	else:
		logger.info( '    Already available: %s', app_path )
Exemplo n.º 7
0
def ImportSTM( app_name, model_path, corpus_path, database_path, is_quiet, force_overwrite ):
	"""Import an STM topic model as a web2py application.

	Copies the corpus and STM model files into the app's folders, builds a
	bag-of-words language model, and computes LDA-style derived statistics
	from the STM output.

	Args:
		app_name: Name of the web2py app; created under 'apps/<app_name>'.
		model_path: Folder holding the trained STM model files.
		corpus_path: Folder containing 'corpus.txt' and the STM JSON/matrix files.
		database_path: Folder containing 'corpus.db'.
		is_quiet: When True the log level is INFO, otherwise DEBUG.
		force_overwrite: Re-import even if the app folder already exists.
	"""
	logger = logging.getLogger( 'termite' )
	logger.addHandler( logging.StreamHandler() )
	logger.setLevel( logging.INFO if is_quiet else logging.DEBUG )
	
	app_path = 'apps/{}'.format( app_name )
	corpus_filename = '{}/corpus.txt'.format( corpus_path )
	database_filename = '{}/corpus.db'.format( database_path )
	logger.info( '--------------------------------------------------------------------------------' )
	logger.info( 'Import an STM topic model as a web2py application...' )
	logger.info( '           app_name = %s', app_name )
	logger.info( '           app_path = %s', app_path )
	logger.info( '         model_path = %s', model_path )
	logger.info( '    corpus_filename = %s', corpus_filename )
	logger.info( '  database_filename = %s', database_filename )
	logger.info( '--------------------------------------------------------------------------------' )
	
	# Skip the import entirely when the app already exists, unless forced.
	if force_overwrite or not os.path.exists( app_path ):
		with CreateApp(app_name) as app:
			# Import corpus (models/corpus.db, data/corpus.txt, data/sentences.txt)
			app_database_filename = '{}/corpus.db'.format( app.GetDatabasePath() )
			app_corpus_filename = '{}/corpus.txt'.format( app.GetDataPath() )
			app_sentences_filename = '{}/sentences.txt'.format( app.GetDataPath() )
			logger.info( 'Copying [%s] --> [%s]', database_filename, app_database_filename )
			shutil.copy( database_filename, app_database_filename )
			logger.info( 'Copying [%s] --> [%s]', corpus_filename, app_corpus_filename )
			shutil.copy( corpus_filename, app_corpus_filename )
			logger.info( 'Extracting [%s] --> [%s]', corpus_filename, app_sentences_filename )
			SplitSentences( corpus_filename, app_sentences_filename )
			
			# Import model (data/*)
			app_model_path = '{}/stm'.format( app.GetDataPath() )
			logger.info( 'Copying [%s] --> [%s]', model_path, app_model_path )
			shutil.copytree( model_path, app_model_path )
			# NOTE(review): the index/matrix files are taken from corpus_path,
			# not model_path — confirm this is intentional upstream.
			for stm_filename in [ 'doc-index.json', 'term-index.json', 'topic-index.json', 'doc-topic-matrix.txt', 'term-topic-matrix.txt' ]:
				source_filename = '{}/{}'.format(corpus_path, stm_filename)
				target_filename = '{}/{}'.format(app_model_path, stm_filename)
				logger.info( 'Copying [%s] --> [%s]', source_filename, target_filename )
				shutil.copy( source_filename, target_filename )
			
			db_path = app.GetDatabasePath()
			with Corpus_DB(db_path) as corpus_db:
				
				# Create a bow-of-words language model
				with BOW_DB(db_path, isInit=True) as bow_db:
					bow_computer = BOW_ComputeStats(bow_db, corpus_db, app_corpus_filename, app_sentences_filename)
					bow_computer.Execute()
				
				# Compute derived-statistics about an LDA-like topic model
				with LDA_DB(db_path, isInit=True) as lda_db:
					stm_reader = STMReader(lda_db, app_model_path, corpus_db)
					stm_reader.Execute()
					lda_computer = LDA_ComputeStats(lda_db, corpus_db)
					lda_computer.Execute()
	else:
		logger.info( '    Already available: %s', app_path )
Exemplo n.º 8
0
def gib():
	"""Update, inspect, and load the group-in-box model, returning it as JSON."""
	with Corpus_DB() as corpus_db:
		with BOW_DB() as bow_db:
			with LDA_DB() as lda_db:
				handler = GroupInBoxHandler(request, response, corpus_db, bow_db, lda_db)
	handler.UpdateModel()
	handler.InspectModel()
	handler.LoadGIB()
	# json.dumps() lost its 'encoding' keyword in Python 3 (it was a no-op
	# 'utf-8' default in Python 2), so it is omitted here.
	dataStr = json.dumps(handler.content, indent=2, sort_keys=True)
	response.headers['Content-Type'] = 'application/json'
	return dataStr
def main():
    """Import an STM topic model from a folder and emit summary files.

    Reads the model (from a folder containing "stm.RData") into an LDA
    database, exports topic-word and topic weights via the sqlite3 CLI,
    and writes max-normalized topic weights to '<path>/meta.json'.
    """
    parser = argparse.ArgumentParser(
        description='Import a STM topic model as a folder of files.')
    parser.add_argument('path',
                        type=str,
                        default='poliblog_1',
                        help='A folder containing file "stm.RData"')
    args = parser.parse_args()
    path = args.path

    logger = logging.getLogger('termite')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.DEBUG)

    # Load the replicated STM model into the LDA database.
    with Corpus_DB('.') as corpus_db:
        with LDA_DB(path, isInit=True) as lda_db:
            reader = STMReader(lda_db,
                               path,
                               corpus_db,
                               r_variable="mod.out.replicate")
            reader.Execute()

    # Export ranked per-term topic weights via the sqlite3 CLI.
    command = 'sqlite3 -separator "\t" {PATH}/lda.db "SELECT topic_index, term_text, value FROM term_topic_matrix INNER JOIN terms ON term_topic_matrix.term_index = terms.term_index ORDER BY topic_index ASC, value DESC" > {PATH}/topic-word-weights.txt'.format(
        PATH=path)
    logger.info(command)
    os.system(command)

    # Export aggregate per-topic weights.
    command = 'sqlite3 -separator "\t" {PATH}/lda.db "SELECT topic_index, SUM(value) FROM doc_topic_matrix GROUP BY topic_index ORDER BY topic_index" > {PATH}/topic-weights.txt'.format(
        PATH=path)
    logger.info(command)
    os.system(command)

    # Re-read the exported weights and normalize by the maximum so that
    # 'value' lies in [0, 1].
    data = []
    max_value = 0
    filename = '{}/topic-weights.txt'.format(path)
    with open(filename, 'r') as f:
        for line in f.read().splitlines():
            topic_index, topic_weight = line.split('\t')
            topic_index = int(topic_index)
            topic_weight = float(topic_weight)
            max_value = max(topic_weight, max_value)
            data.append({
                "topic_index": topic_index,
                "topic_weight": topic_weight,
                "value": topic_weight
            })
    # Guard against ZeroDivisionError when every topic weight is zero.
    if max_value:
        for elem in data:
            elem['value'] = elem['value'] / max_value

    filename = '{}/meta.json'.format(path)
    with open(filename, 'w') as f:
        # json.dump() dropped the 'encoding' keyword in Python 3 (it was
        # the 'utf-8' default in Python 2), so it is omitted here.
        json.dump(data, f, indent=2, sort_keys=True)
Exemplo n.º 10
0
def main():
    """Import a gensim topic model folder and export topic-word weights."""
    arg_parser = argparse.ArgumentParser(
        description='Import a gensim topic model as a folder of files.')
    arg_parser.add_argument(
        'path',
        type=str,
        default='model_001',
        help='A folder containing files "gensim.dict" and "gensim.model"')
    path = arg_parser.parse_args().path

    logger = logging.getLogger('termite')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.DEBUG)

    # Read the gensim model (including its extra state file) into the LDA db.
    with LDA_DB(path, isInit=True) as lda_db:
        GensimReader(lda_db, path, None, extraStateFile=True).Execute()

    # Dump ranked term-topic weights using the sqlite3 command-line tool.
    command = 'sqlite3 -separator "\t" {PATH}/lda.db "SELECT topic_index, term_text, value FROM term_topic_matrix INNER JOIN terms ON term_topic_matrix.term_index = terms.term_index ORDER BY topic_index ASC, value DESC" > {PATH}/topic-word-weights.txt'.format(
        PATH=path)
    logger.info(command)
    os.system(command)
Exemplo n.º 11
0
def index():
	"""Render the term-topic matrix page."""
	# Both databases are needed only while constructing the handler.
	with BOW_DB() as bow_db, LDA_DB() as lda_db:
		handler = TermTopicMatrix1(request, response, bow_db, lda_db)
	return handler.GenerateResponse()
Exemplo n.º 12
0
def index():
    """Render the ITM core page."""
    with LDA_DB() as lda_db:
        itm_handler = ITM_Core(request, response, lda_db)
    return itm_handler.GenerateResponse()
Exemplo n.º 13
0
def gib():
    """Load and render the ITM group-in-box view."""
    # Renamed the local (it previously shadowed this function's own name).
    with Corpus_DB() as corpus_db, LDA_DB() as lda_db:
        box_handler = ITM_GroupInBox(request, response, corpus_db, lda_db)
    box_handler.Load()
    return box_handler.GenerateResponse()
Exemplo n.º 14
0
def Update():
    """Apply a model update request and render the result."""
    with LDA_DB() as lda_db:
        core = ITM_Core(request, response, lda_db)
        core.UpdateModel()
    return core.GenerateResponse()
Exemplo n.º 15
0
def TopDocs():
    """Load the top documents per topic and render the response."""
    with LDA_DB() as lda_db:
        core = LDA_Core(request, response, lda_db)
        core.LoadTopDocs()
    return core.GenerateResponse()
Exemplo n.º 16
0
def DocTopicMatrix():
    """Load the document-topic matrix and render the response."""
    with LDA_DB() as lda_db:
        core = LDA_Core(request, response, lda_db)
        core.LoadDocTopicMatrix()
    return core.GenerateResponse()