Пример #1
0
	def GetVisualizations(self, server, dataset, model, attribute):
		"""Return the visualization descriptors available for *dataset*.

		Availability depends on which model types ('bow', 'lda', 'itm') have
		been built for the corpus. When model == 'default' and attribute ==
		'index', the list is also published under 'AvailableVisualizations'
		in self.content.
		"""
		visualizations = []
		if self.IsExcludedDataset(dataset):
			return visualizations
		with Corpus_DB() as corpus_db:
			rows = corpus_db.GetModels()
		built = frozenset(row['model_key'] for row in rows)
		if 'bow' in built and 'lda' in built:
			visualizations.extend([
				{ 'value' : 'TermTopicMatrix1', 'name' : 'Term-Topic Matrix 0.1' },
				{ 'value' : 'TermTopicMatrix2', 'name' : 'Term-Topic Matrix 0.2' },
				{ 'value' : 'TermTopicMatrix3', 'name' : 'Term-Topic Matrix 0.3' },
				{ 'value' : 'ScatterPlot1', 'name' : 'Scatter Plot 1.0' }
			])
			# Group-in-a-Box additionally requires an ITM model.
			if 'itm' in built:
				visualizations.append({ 'value' : 'GroupInBox', 'name' : 'Group-in-a-Box' })
		if model == 'default' and attribute == 'index':
			self.content.update({ 'AvailableVisualizations' : visualizations })
		return visualizations
Пример #2
0
def ExportCorpus(database_path, corpus_filename):
    database_filename = '{}/corpus.db'.format(database_path)
    print 'Exporting database [{}] to file [{}]'.format(
        database_filename, corpus_filename)

    with Corpus_DB(database_path) as corpusDB:
        corpusDB.ExportToFile(corpus_filename)
Пример #3
0
def index():
	"""Render the Group-in-a-Box landing page with [[ ]] template delimiters."""
	with Corpus_DB() as corpus_db, BOW_DB() as bow_db, LDA_DB() as lda_db:
		handler = GroupInBoxHandler(request, response, corpus_db, bow_db, lda_db)
	response.delimiters = ('[[', ']]')
	return handler.GenerateResponse()
Пример #4
0
def ExportSpreadsheet(database_path, spreadsheet_filename, is_csv):
    database_filename = '{}/corpus.db'.format(database_path)
    print 'Exporting database [{}] to spreadsheet [{}]'.format(
        database_filename, spreadsheet_filename)

    with Corpus_DB(database_path) as corpusDB:
        corpusDB.ExportToSpreadsheet(spreadsheet_filename, is_csv=is_csv)
Пример #5
0
 def GetModels(self, dataset):
     """Return the sorted model records for *dataset*, or None when excluded."""
     if self.IsExcluded(dataset):
         return None
     with Corpus_DB() as corpus_db:
         # sorted() copies into a fresh list, matching the old accumulate-then-sort.
         return sorted(corpus_db.GetModels())
Пример #6
0
def GroupInBox():
    """Load and serve the Group-in-a-Box visualization."""
    with Corpus_DB() as corpus_db, BOW_DB() as bow_db, LDA_DB() as lda_db:
        handler = GroupInBoxHandler(request, response, corpus_db, bow_db, lda_db)
    handler.LoadGIB()
    return handler.GenerateResponse()
Пример #7
0
def Inspect():
    """Inspect the current model and serve the resulting response."""
    with Corpus_DB() as corpus_db, BOW_DB() as bow_db, LDA_DB() as lda_db:
        handler = GroupInBoxHandler(request, response, corpus_db, bow_db, lda_db)
    handler.InspectModel()
    return handler.GenerateResponse()
Пример #8
0
def ImportMalletLDA( app_name, model_path, corpus_path, database_path, is_quiet, force_overwrite ):
	"""Package an ITM (tree-based) topic model as a web2py application.

	Copies the corpus database, corpus text, and model files into the app's
	data/database folders, then computes derived statistics for the corpus,
	the LDA-style model, and the ITM model. Skips everything if the app path
	already exists and force_overwrite is false.

	Args:
		app_name: name of the web2py app; app lives under 'apps/<app_name>'.
		model_path: folder holding the trained TreeTM model (copied to data/treetm).
		corpus_path: folder holding 'corpus.txt'.
		database_path: folder holding the source 'corpus.db'.
		is_quiet: when true, log at INFO; otherwise DEBUG.
		force_overwrite: rebuild the app even if app_path already exists.
	"""
	logger = logging.getLogger( 'termite' )
	logger.addHandler( logging.StreamHandler() )
	logger.setLevel( logging.INFO if is_quiet else logging.DEBUG )
	
	app_path = 'apps/{}'.format( app_name )
	corpus_filename = '{}/corpus.txt'.format( corpus_path )
	database_filename = '{}/corpus.db'.format( database_path )
	logger.info( '--------------------------------------------------------------------------------' )
	logger.info( 'Import an ITM topic model as a web2py application...' )
	logger.info( '           app_name = %s', app_name )
	logger.info( '           app_path = %s', app_path )
	logger.info( '         model_path = %s', model_path )
	logger.info( '    corpus_filename = %s', corpus_filename )
	logger.info( '  database_filename = %s', database_filename )
	logger.info( '--------------------------------------------------------------------------------' )
	
	if force_overwrite or not os.path.exists( app_path ):
		with CreateApp(app_name) as app:
			# Create a copy of the original corpus
			app_database_filename = '{}/corpus.db'.format( app.GetDataPath() )
			logger.info( 'Copying [%s] --> [%s]', database_filename, app_database_filename )
			shutil.copy( database_filename, app_database_filename )
			
			# Import corpus (models/corpus.db, data/corpus.txt, data/sentences.txt)
			app_corpus_filename = '{}/corpus.txt'.format( app.GetDataPath() )
			logger.info( 'Copying [%s] --> [%s]', corpus_filename, app_corpus_filename )
			shutil.copy( corpus_filename, app_corpus_filename )
			app_sentences_filename = '{}/sentences.txt'.format( app.GetDataPath() )
			logger.info( 'Extracting [%s] --> [%s]', corpus_filename, app_sentences_filename )
			SplitSentences( corpus_filename, app_sentences_filename )
			# NOTE(review): corpus.db is copied twice — once into GetDataPath()
			# above and once into GetDatabasePath() here; confirm both copies
			# are actually consumed downstream.
			app_db_filename = '{}/corpus.db'.format( app.GetDatabasePath() )
			logger.info( 'Copying [%s] --> [%s]', database_filename, app_db_filename )
			shutil.copy( database_filename, app_db_filename )
			
			# Compute derived-statistics about the corpus
			db_path = app.GetDatabasePath()
			with Corpus_DB(db_path, isInit=True) as corpus_db:
				computer = Corpus_ComputeStats( corpus_db, app_corpus_filename, app_sentences_filename )
				computer.Execute()
			
				# Import model
				# NOTE(review): the copytree and the LDA/ITM stats below sit
				# inside the Corpus_DB context even though only the stats need
				# corpus_db — presumably intentional to keep corpus_db open.
				app_model_path = '{}/treetm'.format( app.GetDataPath() )
				logger.info( 'Copying [%s] --> [%s]', model_path, app_model_path )
				shutil.copytree( model_path, app_model_path )
			
				# Compute derived-statistics about the model
				with LDA_DB(db_path, isInit=True) as lda_db:
					reader = TreeTMReader( lda_db, app_model_path )
					reader.Execute()
					computer = LDA_ComputeStats( lda_db, corpus_db )
					computer.Execute()
					with ITM_DB(db_path, isInit=True) as itm_db:
						computer = ITM_ComputeStats( itm_db, corpus_db )
						computer.Execute()
	else:
		logger.info( '    Already available: %s', app_path )
Пример #9
0
def ImportSTM( app_name, model_path, corpus_path, database_path, is_quiet, force_overwrite ):
	"""Package an STM topic model as a web2py application.

	Copies the corpus database, corpus text, and STM output files into the
	app, then builds a bag-of-words model and LDA-style derived statistics
	from the STM output. Skips everything if the app path already exists and
	force_overwrite is false.

	Args:
		app_name: name of the web2py app; app lives under 'apps/<app_name>'.
		model_path: folder holding the trained STM model (copied to data/stm).
		corpus_path: folder holding 'corpus.txt' and the STM index/matrix files.
		database_path: folder holding the source 'corpus.db'.
		is_quiet: when true, log at INFO; otherwise DEBUG.
		force_overwrite: rebuild the app even if app_path already exists.
	"""
	logger = logging.getLogger( 'termite' )
	logger.addHandler( logging.StreamHandler() )
	logger.setLevel( logging.INFO if is_quiet else logging.DEBUG )
	
	app_path = 'apps/{}'.format( app_name )
	corpus_filename = '{}/corpus.txt'.format( corpus_path )
	database_filename = '{}/corpus.db'.format( database_path )
	logger.info( '--------------------------------------------------------------------------------' )
	logger.info( 'Import an STM topic model as a web2py application...' )
	logger.info( '           app_name = %s', app_name )
	logger.info( '           app_path = %s', app_path )
	logger.info( '         model_path = %s', model_path )
	logger.info( '    corpus_filename = %s', corpus_filename )
	logger.info( '  database_filename = %s', database_filename )
	logger.info( '--------------------------------------------------------------------------------' )
	
	if force_overwrite or not os.path.exists( app_path ):
		with CreateApp(app_name) as app:
			# Import corpus (models/corpus.db, data/corpus.txt, data/sentences.txt)
			app_database_filename = '{}/corpus.db'.format( app.GetDatabasePath() )
			app_corpus_filename = '{}/corpus.txt'.format( app.GetDataPath() )
			app_sentences_filename = '{}/sentences.txt'.format( app.GetDataPath() )
			logger.info( 'Copying [%s] --> [%s]', database_filename, app_database_filename )
			shutil.copy( database_filename, app_database_filename )
			logger.info( 'Copying [%s] --> [%s]', corpus_filename, app_corpus_filename )
			shutil.copy( corpus_filename, app_corpus_filename )
			logger.info( 'Extracting [%s] --> [%s]', corpus_filename, app_sentences_filename )
			SplitSentences( corpus_filename, app_sentences_filename )
			
			# Import model (data/*)
			app_model_path = '{}/stm'.format( app.GetDataPath() )
			logger.info( 'Copying [%s] --> [%s]', model_path, app_model_path )
			shutil.copytree( model_path, app_model_path )
			# The STM index/matrix files live next to the corpus, not in
			# model_path, so copy them into the app's model folder as well.
			for stm_filename in [ 'doc-index.json', 'term-index.json', 'topic-index.json', 'doc-topic-matrix.txt', 'term-topic-matrix.txt' ]:
				source_filename = '{}/{}'.format(corpus_path, stm_filename)
				target_filename = '{}/{}'.format(app_model_path, stm_filename)
				logger.info( 'Copying [%s] --> [%s]', source_filename, target_filename )
				shutil.copy( source_filename, target_filename )
			
			db_path = app.GetDatabasePath()
			with Corpus_DB(db_path) as corpus_db:
				
				# Create a bow-of-words language model
				with BOW_DB(db_path, isInit=True) as bow_db:
					bow_computer = BOW_ComputeStats(bow_db, corpus_db, app_corpus_filename, app_sentences_filename)
					bow_computer.Execute()
				
				# Compute derived-statistics about an LDA-like topic model
				with LDA_DB(db_path, isInit=True) as lda_db:
					stm_reader = STMReader(lda_db, app_model_path, corpus_db)
					stm_reader.Execute()
					lda_computer = LDA_ComputeStats(lda_db, corpus_db)
					lda_computer.Execute()
	else:
		logger.info( '    Already available: %s', app_path )
Пример #10
0
def ImportCorpus(corpus_filename_or_folder, database_path):
	database_filename = '{}/corpus.db'.format(database_path)
	
	with Corpus_DB(database_path, isImport=True) as corpus_db:
		if os.path.isfile(corpus_filename_or_folder):
			print 'Importing file [{}] into database [{}]'.format(corpus_filename_or_folder, database_filename)
			corpus_db.ImportFromFile(corpus_filename_or_folder)
		else:
			print 'Importing folder [{}] into database [{}]'.format(corpus_filename_or_folder, database_filename)
			corpus_db.ImportFromFolder(corpus_filename_or_folder)
Пример #11
0
def gib():
	"""Return the Group-in-a-Box model state as a JSON response body."""
	with Corpus_DB() as corpus_db, BOW_DB() as bow_db, LDA_DB() as lda_db:
		handler = GroupInBoxHandler(request, response, corpus_db, bow_db, lda_db)
	handler.UpdateModel()
	handler.InspectModel()
	handler.LoadGIB()
	response.headers['Content-Type'] = 'application/json'
	return json.dumps(handler.content, encoding='utf-8', indent=2, sort_keys=True)
Пример #12
0
def ImportSpreadsheet(spreadsheet_filename, database_path, id_key, content_key,
                      is_csv):
    database_filename = '{}/corpus.db'.format(database_path)
    print 'Importing spreadsheet [{}] into database [{}]'.format(
        spreadsheet_filename, database_filename)

    with Corpus_DB(database_path, isINit=True) as corpus_db:
        corpus_db.ImportFromSpreadsheet(spreadsheet_filename,
                                        is_csv=is_csv,
                                        id_key=id_key,
                                        content_key=content_key)
Пример #13
0
def ExportSpreadsheet(database_path, spreadsheet_filename, id_key, content_key,
                      is_csv):
    database_filename = '{}/corpus.db'.format(database_path)
    print 'Exporting database [{}] to spreadsheet [{}]'.format(
        database_filename, spreadsheet_filename)

    with Corpus_DB(database_path) as corpus_db:
        corpus_db.ExportToSpreadsheet(spreadsheet_filename,
                                      is_csv=is_csv,
                                      id_key=id_key,
                                      content_key=content_key)
Пример #14
0
 def GetModels(self, server, dataset):
     """Return {value, name} descriptors for the models built for *dataset*.

     Also publishes the list under 'AvailableModels' in self.content unless
     the dataset is excluded.
     """
     if self.IsExcludedDataset(dataset):
         return []
     with Corpus_DB() as corpus_db:
         rows = corpus_db.GetModels()
     descriptors = [
         {'value': row['model_key'], 'name': row['model_desc']}
         for row in rows
     ]
     self.content.update({'AvailableModels': descriptors})
     return descriptors
def main():
    """Read an STM model folder into lda.db, export topic weights, write meta.json.

    Expects a folder (positional 'path') containing the STM output; writes
    '<path>/topic-word-weights.txt', '<path>/topic-weights.txt' and
    '<path>/meta.json'. Requires the 'sqlite3' command-line tool on PATH.
    """
    parser = argparse.ArgumentParser(
        description='Import a STM topic model as a folder of files.')
    parser.add_argument('path',
                        type=str,
                        default='poliblog_1',
                        help='A folder containing file "stm.RData"')
    args = parser.parse_args()
    path = args.path

    logger = logging.getLogger('termite')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.DEBUG)

    # Read the STM model (replicate run) into an LDA-style database.
    with Corpus_DB('.') as corpus_db:
        with LDA_DB(path, isInit=True) as lda_db:
            reader = STMReader(lda_db,
                               path,
                               corpus_db,
                               r_variable="mod.out.replicate")
            reader.Execute()

    # Shell out to the sqlite3 CLI to dump per-term and per-topic weights.
    command = 'sqlite3 -separator "\t" {PATH}/lda.db "SELECT topic_index, term_text, value FROM term_topic_matrix INNER JOIN terms ON term_topic_matrix.term_index = terms.term_index ORDER BY topic_index ASC, value DESC" > {PATH}/topic-word-weights.txt'.format(
        PATH=path)
    logger.info(command)
    os.system(command)

    command = 'sqlite3 -separator "\t" {PATH}/lda.db "SELECT topic_index, SUM(value) FROM doc_topic_matrix GROUP BY topic_index ORDER BY topic_index" > {PATH}/topic-weights.txt'.format(
        PATH=path)
    logger.info(command)
    os.system(command)

    # Re-read the exported topic weights and normalize 'value' to [0, 1].
    data = []
    max_value = 0
    filename = '{}/topic-weights.txt'.format(path)
    with open(filename, 'r') as f:
        for line in f.read().splitlines():
            topic_index, topic_weight = line.split('\t')
            topic_index = int(topic_index)
            topic_weight = float(topic_weight)
            max_value = max(topic_weight, max_value)
            data.append({
                "topic_index": topic_index,
                "topic_weight": topic_weight,
                "value": topic_weight
            })
    # NOTE(review): divides by max_value — ZeroDivisionError if the export
    # produced no rows or all-zero weights; confirm inputs are non-empty.
    for elem in data:
        elem['value'] = elem['value'] / max_value

    filename = '{}/meta.json'.format(path)
    with open(filename, 'w') as f:
        # json.dump's 'encoding' kwarg is Python-2 only.
        json.dump(data, f, encoding='utf-8', indent=2, sort_keys=True)
Пример #16
0
 def ImportSpreadsheet(dataset_id,
                       spreadsheet_filename,
                       is_csv=False,
                       id_column='doc_id',
                       content_column='doc_content'):
     """Import a spreadsheet into dataset *dataset_id*'s corpus database,
     creating the dataset folder on demand."""
     dataset_path = '{}/data/{}'.format(request.folder, dataset_id)
     if not os.path.exists(dataset_path):
         os.makedirs(dataset_path)
     import_options = dict(is_csv=is_csv,
                           id_key=id_column,
                           content_key=content_column)
     with Corpus_DB(path=dataset_path, isImport=True) as corpus_db:
         corpus_db.ImportFromSpreadsheet(spreadsheet_filename, **import_options)
Пример #17
0
def index():
	"""Render the default corpus page."""
	with Corpus_DB() as db:
		handler = Corpus_Core(request, response, db)
	return handler.GenerateResponse()
Пример #18
0
def Metadata():
	"""Load the corpus metadata fields and render the response."""
	with Corpus_DB() as db:
		handler = Corpus_Core(request, response, db)
		handler.LoadMetadataFields()
	return handler.GenerateResponse()
Пример #19
0
def SearchDocuments():
	"""Run a document search against the corpus and render the response."""
	with Corpus_DB() as db:
		handler = Corpus_Core(request, response, db)
		handler.SearchDocuments()
	return handler.GenerateResponse()
Пример #20
0
def DocumentById():
	"""Look up a single document by its id and render the response."""
	with Corpus_DB() as db:
		handler = Corpus_Core(request, response, db)
		handler.LoadDocumentById()
	return handler.GenerateResponse()
Пример #21
0
def gib():
    """Load and serve the ITM group-in-a-box view."""
    with Corpus_DB() as corpus_db, LDA_DB() as lda_db:
        # 'view' avoids shadowing the function name (original local was 'gib').
        view = ITM_GroupInBox(request, response, corpus_db, lda_db)
    view.Load()
    return view.GenerateResponse()
Пример #22
0
 def ImportPlaintext(dataset_id, plaintext_filename):
     """Import a plaintext corpus file into dataset *dataset_id*, creating
     the dataset folder on demand."""
     dataset_path = '{}/data/{}'.format(request.folder, dataset_id)
     if not os.path.exists(dataset_path):
         os.makedirs(dataset_path)
     with Corpus_DB(path=dataset_path, isImport=True) as db:
         db.ImportFromFile(plaintext_filename)
Пример #23
0
def TermG2():
    """Load term G2 (log-likelihood) statistics and render the response."""
    with Corpus_DB() as db:
        handler = Corpus_Core(request, response, db)
        handler.LoadTermG2()
    return handler.GenerateResponse()
Пример #24
0
def SentenceCoProbs():
    """Load sentence co-occurrence probabilities and render the response."""
    with Corpus_DB() as db:
        handler = Corpus_Core(request, response, db)
        handler.LoadSentenceCoProbs()
    return handler.GenerateResponse()