def GetVisualizations(self, server, dataset, model, attribute):
	"""Return the visualizations applicable to *dataset*, based on which
	model tables ('bow', 'lda', 'itm') exist in its corpus database.

	Also records the list in self.content under 'AvailableVisualizations'
	when rendering the default index view."""
	available = []
	if not self.IsExcludedDataset(dataset):
		with Corpus_DB() as corpus_db:
			model_rows = corpus_db.GetModels()
		model_keys = frozenset(entry['model_key'] for entry in model_rows)
		has_bow_lda = 'bow' in model_keys and 'lda' in model_keys
		if has_bow_lda:
			available.extend([
				{ 'value' : 'TermTopicMatrix1', 'name' : 'Term-Topic Matrix 0.1' },
				{ 'value' : 'TermTopicMatrix2', 'name' : 'Term-Topic Matrix 0.2' },
				{ 'value' : 'TermTopicMatrix3', 'name' : 'Term-Topic Matrix 0.3' },
				{ 'value' : 'ScatterPlot1', 'name' : 'Scatter Plot 1.0' }
			])
		# Group-in-a-Box additionally requires an interactive topic model.
		if has_bow_lda and 'itm' in model_keys:
			available.append({ 'value' : 'GroupInBox', 'name' : 'Group-in-a-Box' })
	if model == 'default' and attribute == 'index':
		self.content.update({ 'AvailableVisualizations' : available })
	return available
def ExportCorpus(database_path, corpus_filename): database_filename = '{}/corpus.db'.format(database_path) print 'Exporting database [{}] to file [{}]'.format( database_filename, corpus_filename) with Corpus_DB(database_path) as corpusDB: corpusDB.ExportToFile(corpus_filename)
def index():
	"""Render the Group-in-a-Box page, using [[ ]] template delimiters so
	the view markup does not clash with client-side templating."""
	with Corpus_DB() as corpus_db, BOW_DB() as bow_db, LDA_DB() as lda_db:
		gib_handler = GroupInBoxHandler(request, response, corpus_db, bow_db, lda_db)
		response.delimiters = ('[[', ']]')
		return gib_handler.GenerateResponse()
def ExportSpreadsheet(database_path, spreadsheet_filename, is_csv): database_filename = '{}/corpus.db'.format(database_path) print 'Exporting database [{}] to spreadsheet [{}]'.format( database_filename, spreadsheet_filename) with Corpus_DB(database_path) as corpusDB: corpusDB.ExportToSpreadsheet(spreadsheet_filename, is_csv=is_csv)
def GetModels(self, dataset):
	"""Return the sorted model records for *dataset*, or None when the
	dataset is excluded."""
	if self.IsExcluded(dataset):
		return None
	with Corpus_DB() as corpus_db:
		model_rows = list(corpus_db.GetModels())
	return sorted(model_rows)
def GroupInBox():
	"""Load the Group-in-a-Box data and render the response."""
	with Corpus_DB() as corpus_db, BOW_DB() as bow_db, LDA_DB() as lda_db:
		gib_handler = GroupInBoxHandler(request, response, corpus_db, bow_db, lda_db)
		gib_handler.LoadGIB()
		return gib_handler.GenerateResponse()
def Inspect():
	"""Inspect the current topic model and render the response."""
	with Corpus_DB() as corpus_db, BOW_DB() as bow_db, LDA_DB() as lda_db:
		inspect_handler = GroupInBoxHandler(request, response, corpus_db, bow_db, lda_db)
		inspect_handler.InspectModel()
		return inspect_handler.GenerateResponse()
def ImportMalletLDA( app_name, model_path, corpus_path, database_path, is_quiet, force_overwrite ):
	"""Import a TreeTM/ITM topic model as a web2py application.

	Creates app 'apps/<app_name>', copies the corpus text and database into
	it, splits sentences, copies the model folder, and computes derived
	corpus/LDA/ITM statistics. Skipped if the app already exists, unless
	force_overwrite is set.
	"""
	logger = logging.getLogger( 'termite' )
	logger.addHandler( logging.StreamHandler() )
	# NOTE(review): is_quiet -> INFO (less verbose), otherwise DEBUG.
	logger.setLevel( logging.INFO if is_quiet else logging.DEBUG )
	app_path = 'apps/{}'.format( app_name )
	corpus_filename = '{}/corpus.txt'.format( corpus_path )
	database_filename = '{}/corpus.db'.format( database_path )
	logger.info( '--------------------------------------------------------------------------------' )
	logger.info( 'Import an ITM topic model as a web2py application...' )
	logger.info( '     app_name = %s', app_name )
	logger.info( '     app_path = %s', app_path )
	logger.info( '   model_path = %s', model_path )
	logger.info( '    corpus_filename = %s', corpus_filename )
	logger.info( '  database_filename = %s', database_filename )
	logger.info( '--------------------------------------------------------------------------------' )
	if force_overwrite or not os.path.exists( app_path ):
		with CreateApp(app_name) as app:
			# Create a copy of the original corpus
			app_database_filename = '{}/corpus.db'.format( app.GetDataPath() )
			logger.info( 'Copying [%s] --> [%s]', database_filename, app_database_filename )
			shutil.copy( database_filename, app_database_filename )
			# Import corpus (models/corpus.db, data/corpus.txt, data/sentences.txt)
			app_corpus_filename = '{}/corpus.txt'.format( app.GetDataPath() )
			logger.info( 'Copying [%s] --> [%s]', corpus_filename, app_corpus_filename )
			shutil.copy( corpus_filename, app_corpus_filename )
			app_sentences_filename = '{}/sentences.txt'.format( app.GetDataPath() )
			logger.info( 'Extracting [%s] --> [%s]', corpus_filename, app_sentences_filename )
			SplitSentences( corpus_filename, app_sentences_filename )
			# NOTE(review): corpus.db is copied twice — once into the data
			# folder above and once into the database folder here; presumably
			# intentional (data copy for export, db copy for queries) — confirm.
			app_db_filename = '{}/corpus.db'.format( app.GetDatabasePath() )
			logger.info( 'Copying [%s] --> [%s]', database_filename, app_db_filename )
			shutil.copy( database_filename, app_db_filename )
			# Compute derived-statistics about the corpus
			db_path = app.GetDatabasePath()
			with Corpus_DB(db_path, isInit=True) as corpus_db:
				computer = Corpus_ComputeStats( corpus_db, app_corpus_filename, app_sentences_filename )
				computer.Execute()
				# Import model
				app_model_path = '{}/treetm'.format( app.GetDataPath() )
				logger.info( 'Copying [%s] --> [%s]', model_path, app_model_path )
				shutil.copytree( model_path, app_model_path )
				# Compute derived-statistics about the model
				with LDA_DB(db_path, isInit=True) as lda_db:
					reader = TreeTMReader( lda_db, app_model_path )
					reader.Execute()
					computer = LDA_ComputeStats( lda_db, corpus_db )
					computer.Execute()
				with ITM_DB(db_path, isInit=True) as itm_db:
					computer = ITM_ComputeStats( itm_db, corpus_db )
					computer.Execute()
	else:
		logger.info( ' Already available: %s', app_path )
def ImportSTM( app_name, model_path, corpus_path, database_path, is_quiet, force_overwrite ):
	"""Import an STM topic model as a web2py application.

	Creates app 'apps/<app_name>', copies the corpus database/text into it,
	splits sentences, copies the STM model folder plus its index/matrix
	files, then computes bag-of-words and LDA-equivalent statistics.
	Skipped if the app already exists, unless force_overwrite is set.
	"""
	logger = logging.getLogger( 'termite' )
	logger.addHandler( logging.StreamHandler() )
	# NOTE(review): is_quiet -> INFO (less verbose), otherwise DEBUG.
	logger.setLevel( logging.INFO if is_quiet else logging.DEBUG )
	app_path = 'apps/{}'.format( app_name )
	corpus_filename = '{}/corpus.txt'.format( corpus_path )
	database_filename = '{}/corpus.db'.format( database_path )
	logger.info( '--------------------------------------------------------------------------------' )
	logger.info( 'Import an STM topic model as a web2py application...' )
	logger.info( '     app_name = %s', app_name )
	logger.info( '     app_path = %s', app_path )
	logger.info( '   model_path = %s', model_path )
	logger.info( '    corpus_filename = %s', corpus_filename )
	logger.info( '  database_filename = %s', database_filename )
	logger.info( '--------------------------------------------------------------------------------' )
	if force_overwrite or not os.path.exists( app_path ):
		with CreateApp(app_name) as app:
			# Import corpus (models/corpus.db, data/corpus.txt, data/sentences.txt)
			app_database_filename = '{}/corpus.db'.format( app.GetDatabasePath() )
			app_corpus_filename = '{}/corpus.txt'.format( app.GetDataPath() )
			app_sentences_filename = '{}/sentences.txt'.format( app.GetDataPath() )
			logger.info( 'Copying [%s] --> [%s]', database_filename, app_database_filename )
			shutil.copy( database_filename, app_database_filename )
			logger.info( 'Copying [%s] --> [%s]', corpus_filename, app_corpus_filename )
			shutil.copy( corpus_filename, app_corpus_filename )
			logger.info( 'Extracting [%s] --> [%s]', corpus_filename, app_sentences_filename )
			SplitSentences( corpus_filename, app_sentences_filename )
			# Import model (data/*)
			app_model_path = '{}/stm'.format( app.GetDataPath() )
			logger.info( 'Copying [%s] --> [%s]', model_path, app_model_path )
			shutil.copytree( model_path, app_model_path )
			# NOTE(review): the index/matrix files are read from corpus_path,
			# not model_path — confirm that is where the STM exporter wrote them.
			for stm_filename in [ 'doc-index.json', 'term-index.json', 'topic-index.json', 'doc-topic-matrix.txt', 'term-topic-matrix.txt' ]:
				source_filename = '{}/{}'.format(corpus_path, stm_filename)
				target_filename = '{}/{}'.format(app_model_path, stm_filename)
				logger.info( 'Copying [%s] --> [%s]', source_filename, target_filename )
				shutil.copy( source_filename, target_filename )
			db_path = app.GetDatabasePath()
			with Corpus_DB(db_path) as corpus_db:
				# Create a bow-of-words language model
				with BOW_DB(db_path, isInit=True) as bow_db:
					bow_computer = BOW_ComputeStats(bow_db, corpus_db, app_corpus_filename, app_sentences_filename)
					bow_computer.Execute()
				# Compute derived-statistics about an LDA-like topic model
				with LDA_DB(db_path, isInit=True) as lda_db:
					stm_reader = STMReader(lda_db, app_model_path, corpus_db)
					stm_reader.Execute()
					lda_computer = LDA_ComputeStats(lda_db, corpus_db)
					lda_computer.Execute()
	else:
		logger.info( ' Already available: %s', app_path )
def ImportCorpus(corpus_filename_or_folder, database_path): database_filename = '{}/corpus.db'.format(database_path) with Corpus_DB(database_path, isImport=True) as corpus_db: if os.path.isfile(corpus_filename_or_folder): print 'Importing file [{}] into database [{}]'.format(corpus_filename_or_folder, database_filename) corpus_db.ImportFromFile(corpus_filename_or_folder) else: print 'Importing folder [{}] into database [{}]'.format(corpus_filename_or_folder, database_filename) corpus_db.ImportFromFolder(corpus_filename_or_folder)
def gib():
	"""JSON endpoint: update, inspect, and load the Group-in-a-Box model,
	then return the handler's content serialized as JSON."""
	with Corpus_DB() as corpus_db, BOW_DB() as bow_db, LDA_DB() as lda_db:
		gib_handler = GroupInBoxHandler(request, response, corpus_db, bow_db, lda_db)
		gib_handler.UpdateModel()
		gib_handler.InspectModel()
		gib_handler.LoadGIB()
		response.headers['Content-Type'] = 'application/json'
		return json.dumps(gib_handler.content, encoding='utf-8', indent=2, sort_keys=True)
def ImportSpreadsheet(spreadsheet_filename, database_path, id_key, content_key, is_csv): database_filename = '{}/corpus.db'.format(database_path) print 'Importing spreadsheet [{}] into database [{}]'.format( spreadsheet_filename, database_filename) with Corpus_DB(database_path, isINit=True) as corpus_db: corpus_db.ImportFromSpreadsheet(spreadsheet_filename, is_csv=is_csv, id_key=id_key, content_key=content_key)
def ExportSpreadsheet(database_path, spreadsheet_filename, id_key, content_key, is_csv): database_filename = '{}/corpus.db'.format(database_path) print 'Exporting database [{}] to spreadsheet [{}]'.format( database_filename, spreadsheet_filename) with Corpus_DB(database_path) as corpus_db: corpus_db.ExportToSpreadsheet(spreadsheet_filename, is_csv=is_csv, id_key=id_key, content_key=content_key)
def GetModels(self, server, dataset):
	"""Return the models available for *dataset* as value/name pairs and
	record them in self.content under 'AvailableModels'."""
	models = []
	if not self.IsExcludedDataset(dataset):
		with Corpus_DB() as corpus_db:
			for row in corpus_db.GetModels():
				models.append({ 'value': row['model_key'], 'name': row['model_desc'] })
	self.content.update({'AvailableModels': models})
	return models
def main():
	"""Import an STM topic model from a folder, then export topic-word
	weights, per-topic weights, and a meta.json of normalized topic weights.
	"""
	parser = argparse.ArgumentParser( description='Import a STM topic model as a folder of files.')
	parser.add_argument('path', type=str, default='poliblog_1', help='A folder containing file "stm.RData"')
	args = parser.parse_args()
	path = args.path
	logger = logging.getLogger('termite')
	logger.addHandler(logging.StreamHandler())
	logger.setLevel(logging.DEBUG)
	# Read the STM model (R variable "mod.out.replicate") into lda.db.
	with Corpus_DB('.') as corpus_db:
		with LDA_DB(path, isInit=True) as lda_db:
			reader = STMReader(lda_db, path, corpus_db, r_variable="mod.out.replicate")
			reader.Execute()
	# SECURITY NOTE(review): 'path' comes from the command line and is
	# interpolated into a shell command — quote/validate it before wider use.
	command = 'sqlite3 -separator "\t" {PATH}/lda.db "SELECT topic_index, term_text, value FROM term_topic_matrix INNER JOIN terms ON term_topic_matrix.term_index = terms.term_index ORDER BY topic_index ASC, value DESC" > {PATH}/topic-word-weights.txt'.format( PATH=path)
	logger.info(command)
	os.system(command)
	command = 'sqlite3 -separator "\t" {PATH}/lda.db "SELECT topic_index, SUM(value) FROM doc_topic_matrix GROUP BY topic_index ORDER BY topic_index" > {PATH}/topic-weights.txt'.format( PATH=path)
	logger.info(command)
	os.system(command)
	# Re-read the exported topic weights and track the maximum for scaling.
	data = []
	max_value = 0
	filename = '{}/topic-weights.txt'.format(path)
	with open(filename, 'r') as f:
		for line in f.read().splitlines():
			topic_index, topic_weight = line.split('\t')
			topic_index = int(topic_index)
			topic_weight = float(topic_weight)
			max_value = max(topic_weight, max_value)
			data.append({
				"topic_index": topic_index,
				"topic_weight": topic_weight,
				"value": topic_weight
			})
	# Normalize 'value' to [0, 1] by the largest topic weight.
	# NOTE(review): divides by zero if the weights file is empty/all-zero.
	for elem in data:
		elem['value'] = elem['value'] / max_value
	filename = '{}/meta.json'.format(path)
	with open(filename, 'w') as f:
		json.dump(data, f, encoding='utf-8', indent=2, sort_keys=True)
def ImportSpreadsheet(dataset_id, spreadsheet_filename, is_csv=False, id_column='doc_id', content_column='doc_content'):
	"""Import a spreadsheet into the dataset's corpus database, creating
	the dataset folder under the app's data directory if necessary."""
	dataset_path = '{}/data/{}'.format(request.folder, dataset_id)
	if not os.path.exists(dataset_path):
		os.makedirs(dataset_path)
	with Corpus_DB(path=dataset_path, isImport=True) as db:
		db.ImportFromSpreadsheet(spreadsheet_filename, is_csv=is_csv, id_key=id_column, content_key=content_column)
def index():
	"""Render the default corpus page."""
	with Corpus_DB() as corpus_db:
		core = Corpus_Core(request, response, corpus_db)
		return core.GenerateResponse()
def Metadata():
	"""Load the corpus metadata fields and render the response."""
	with Corpus_DB() as corpus_db:
		core = Corpus_Core(request, response, corpus_db)
		core.LoadMetadataFields()
		return core.GenerateResponse()
def SearchDocuments():
	"""Run a document search over the corpus and render the response."""
	with Corpus_DB() as corpus_db:
		core = Corpus_Core(request, response, corpus_db)
		core.SearchDocuments()
		return core.GenerateResponse()
def DocumentById():
	"""Look up a single document by its id and render the response."""
	with Corpus_DB() as corpus_db:
		core = Corpus_Core(request, response, corpus_db)
		core.LoadDocumentById()
		return core.GenerateResponse()
def gib():
	"""Load the ITM Group-in-a-Box view and render the response."""
	with Corpus_DB() as corpus_db, LDA_DB() as lda_db:
		box = ITM_GroupInBox(request, response, corpus_db, lda_db)
		box.Load()
		return box.GenerateResponse()
def ImportPlaintext(dataset_id, plaintext_filename):
	"""Import a plaintext file into the dataset's corpus database, creating
	the dataset folder under the app's data directory if necessary."""
	dataset_path = '{}/data/{}'.format(request.folder, dataset_id)
	if not os.path.exists(dataset_path):
		os.makedirs(dataset_path)
	with Corpus_DB(path=dataset_path, isImport=True) as db:
		db.ImportFromFile(plaintext_filename)
def TermG2():
	"""Load the term G2 statistics and render the response."""
	with Corpus_DB() as corpus_db:
		core = Corpus_Core(request, response, corpus_db)
		core.LoadTermG2()
		return core.GenerateResponse()
def SentenceCoProbs():
	"""Load the sentence co-occurrence probabilities and render the response."""
	with Corpus_DB() as corpus_db:
		core = Corpus_Core(request, response, corpus_db)
		core.LoadSentenceCoProbs()
		return core.GenerateResponse()