def create_topicdoc_group_matrix(self, group_field='doc_label'):
    """Create topicdoc group matrix table"""
    # Get source doc table
    corpus_db_file = self.config.generate_corpus_db_file_path()
    corpus = PoloDb(corpus_db_file)
    src_docs = corpus.get_table('doc')
    if group_field not in src_docs.columns:
        raise ValueError('Column `{}` does not exist on corpus doc table.'.format(group_field))
    src_docs.rename(columns={'doc_id': 'src_doc_id'}, inplace=True)
    del corpus

    # Add the model doc_id to src_doc
    docs = self.get_table('doc')
    src_docs = src_docs.merge(docs[['doc_id', 'src_doc_id']], on='src_doc_id', how='right')
    src_docs.set_index('doc_id', inplace=True)  # Change index to align with doctopics
    del docs

    # Get doctopic table and pivot it into a doc-by-topic matrix
    doctopics = self.get_table('doctopic', set_index=True)
    dtw = doctopics['topic_weight'].unstack()
    del doctopics

    # Group the doc-topic weights by the requested field and average them
    dtw['doc_group'] = src_docs[group_field]
    dtg = dtw.groupby('doc_group')
    dtm = dtg.mean().fillna(0)
    if dtm.columns.nlevels == 2:
        dtm.columns = dtm.columns.droplevel(0)
    self.put_table(dtm, 'topic{}_matrix'.format(group_field), index=True)

    # Record how many docs fall into each group
    # (count of non-null weights in topic column 0)
    dtm_counts = dtg[0].count().fillna(0)
    dtm_counts.name = 'doc_count'
    self.put_table(dtm_counts, 'topic{}_matrix_counts'.format(group_field), index=True)
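# A minimal, self-contained sketch of the pivot-and-average step above, using
# toy data (not part of the project API): long-form doc-topic weights are
# unstacked into a doc-by-topic matrix, then averaged per group label.
import pandas as pd

_dt = pd.DataFrame({
    'doc_id':       [0, 0, 1, 1, 2, 2],
    'topic_id':     [0, 1, 0, 1, 0, 1],
    'topic_weight': [.9, .1, .2, .8, .5, .5],
}).set_index(['doc_id', 'topic_id'])
_dtw = _dt['topic_weight'].unstack()     # rows: docs, cols: topics
_dtw['doc_group'] = ['a', 'a', 'b']      # stand-in for src_docs[group_field]
print(_dtw.groupby('doc_group').mean())  # rows: groups, cols: mean topic weights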
def add_topiccompcorr(self):
    """Add topic component correlation table"""
    corpus_db_file = self.config.generate_corpus_db_file_path()
    corpus = PoloDb(corpus_db_file)
    pca_doc = corpus.get_table('pca_doc')
    del corpus
    pca_doc = pca_doc.set_index('doc_id')
    sql = """
    SELECT a.src_doc_id AS doc_id, topic_id, topic_weight
    FROM doc a JOIN doctopic b USING(doc_id)
    """
    doctopic = pd.read_sql_query(sql, self.conn, index_col=['doc_id', 'topic_id'])
    dtm = doctopic.unstack()
    dtm.columns = dtm.columns.droplevel(0)
    # dtm.columns = ["T{0}".format(col) for col in dtm.columns]
    X = dtm.T.dot(pca_doc)
    self.put_table(X, 'topiccomp_corr', index=True)

    # Add topic poles: the topics most positively and most negatively
    # associated with each principal component
    A = X.idxmax()
    B = X.idxmin()
    C = pd.concat([A, B], axis=1)
    C.columns = ['max_pos_topic_id', 'max_neg_topic_id']
    C.index = [int(idx.replace('PC', '')) for idx in C.index]
    C.index.name = 'pc_id'
    self.put_table(C, 'topiccomp_pole', index=True)
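# Toy illustration of the correlation step above (made-up numbers, not the
# project's real data): a docs-by-topics matrix transposed and dotted with a
# docs-by-components matrix yields a topics-by-components association table;
# idxmax/idxmin per component give the "pole" topics.
import pandas as pd

_dtm = pd.DataFrame({0: [.9, .2, .5], 1: [.1, .8, .5]},
                    index=[0, 1, 2])      # docs x topics
_pca = pd.DataFrame({'PC0': [1.2, -0.3, 0.1], 'PC1': [0.0, 0.9, -0.8]},
                    index=[0, 1, 2])      # docs x components
_X = _dtm.T.dot(_pca)                     # topics x components
print(_X.idxmax())  # topic most positively associated with each PC
print(_X.idxmin())  # topic most negatively associated with each PC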
def __init__(self, config, trial='trial1'):
    """Initialize MALLET with trial name"""
    if trial not in config.trials:
        raise ValueError("Invalid trial name `{}`.".format(trial))
    self.config = config
    self.trial = trial
    self.config.set_config_attributes(self)  # Prefixes keys with cfg_
    self.config.set_config_attributes(self, self.trial)

    # todo: Put this in config.ini
    self.cfg_tw_quantile = 0.8

    # Temporary hack to handle casting
    for key in "num_topics num_iterations optimize_interval num_threads num_top_words".split():
        att = 'cfg_{}'.format(key)
        setattr(self, att, int(getattr(self, att)))
    self.cfg_thresh = float(self.cfg_thresh)

    # Get replacement files
    # todo: Fix order; higher ngrams should go first ... argues for sortable names
    self.replacement_files = self.cfg_replacements
    for filename in os.listdir('corpus'):
        if 'replacements_' in filename:
            self.replacement_files += ' corpus/' + filename

    self.trial_name = self.trial  # HACK
    self.file_prefix = '{}/{}'.format(self.cfg_mallet_out_dir, self.trial_name)
    self.mallet = {'import-file': {}, 'train-topics': {}}
    self.mallet_init()
    dbfile = self.config.generate_model_db_file_path(self.trial)
    PoloDb.__init__(self, dbfile)
def create_topicdoc_col_matrix(self, group_col):
    """Create topicdoc matrix table for a group column"""
    # Get source doc table
    corpus_db_file = self.config.generate_corpus_db_file_path()
    corpus = PoloDb(corpus_db_file)
    src_docs = corpus.get_table('doc')
    src_docs.rename(columns={'doc_id': 'src_doc_id'}, inplace=True)
    del corpus

    # Add the model doc_id to src_doc
    docs = self.get_table('doc')
    src_docs = src_docs.merge(docs[['doc_id', 'src_doc_id']], on='src_doc_id', how='right')
    src_docs.set_index('doc_id', inplace=True)  # Change index to align with doctopics
    del docs

    # Get doctopic table
    # thresh = self.get_thresh()
    # doctopics = pd.read_sql_query('SELECT * FROM doctopic WHERE topic_weight >= ?',
    #                               self.conn, params=(thresh,))
    # doctopics.set_index(['doc_id', 'topic_id'], inplace=True)
    doctopics = self.get_table('doctopic', set_index=True)
    dtw = doctopics['topic_weight'].unstack()
    del doctopics

    # todo: Streamline the logic here
    if group_col == 'ord':
        doc_col = self.config.ini['DEFAULT']['src_ord_col']
    elif group_col == 'label':
        doc_col = 'doc_label'
    else:
        group_col = 'ord'
        doc_col = self.config.ini['DEFAULT']['src_ord_col']

    dtw['doc_group'] = src_docs[doc_col]
    dtg = dtw.groupby('doc_group')
    dtm = dtg.mean().fillna(0)
    if dtm.columns.nlevels == 2:
        dtm.columns = dtm.columns.droplevel(0)
    self.put_table(dtm, 'topicdoc{}_matrix'.format(group_col), index=True)

    dtm_counts = dtg[0].count().fillna(0)
    dtm_counts.name = 'doc_count'
    self.put_table(dtm_counts, 'topicdoc{}_matrix_counts'.format(group_col), index=True)
def __init__(self, config):
    """Initialize corpus object"""
    # Import Configs
    self.config = config
    self.config.set_config_attributes(self)
    if not os.path.isfile(self.cfg_src_file_name):
        raise ValueError("Missing source file. Check value of `src_file_name` in INI file.")
    self.dbfile = config.generate_corpus_db_file_path()
    PoloDb.__init__(self, self.dbfile)
    # self.db = PoloDb(self.dbfile)  # Why not do this?
    if self.cfg_nltk_data_path:
        nltk.data.path.append(self.cfg_nltk_data_path)

    # For tokenizing into sentences
    # fixme: TOKENIZER ASSUMES ENGLISH -- PARAMETERIZE THIS
    nltk.download('punkt')
    nltk.download('tagsets')
    nltk.download('averaged_perceptron_tagger')
    self.tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
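# A minimal sketch of the fixme above: parameterize the sentence tokenizer's
# language instead of hard-coding English. The helper name and the idea of a
# config key for the language are assumptions for illustration, not part of
# the existing INI schema; the nltk calls mirror the ones used above.
import nltk

def load_sentence_tokenizer(lang='english'):
    """Load a punkt sentence tokenizer for the given language."""
    nltk.download('punkt', quiet=True)  # no-op if already downloaded
    return nltk.data.load('nltk:tokenizers/punkt/{}.pickle'.format(lang))

# tokenizer = load_sentence_tokenizer('german')  # e.g., for a German corpus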
class PoloGensim:

    def __init__(self, config, corpus_dbfile=None):
        self.gs_corpus = None
        self.gs_dict = None
        self.db = PoloDb(corpus_dbfile)

    def make_gs_corpus(self):
        doctokenbow = self.db.get_table('doctokenbow')
        doctokenbow.set_index('doc_id', inplace=True)
        self.gs_corpus = [
            [(row[0], row[1])
             for row in doctokenbow.loc[[doc_id], ['token_id', 'token_count']].values]
            for doc_id in doctokenbow.index.unique()
        ]
        """
        # Old school loop way
        self.gs_corpus = []
        for doc_id in doctokenbow.index.unique():
            doc = []
            for row in doctokenbow.loc[doc_id, ['token_id', 'token_count']].values:
                doc.append((row[0], row[1]))
            self.gs_corpus.append(doc)
        """

    def make_gs_dict(self):
        token = self.db.get_table('token')
        self.gs_dict = {row[0]: row[1] for row in token[['token_id', 'token_str']].values}

    def get_hdp(self):
        hdp = models.HdpModel(self.gs_corpus, self.gs_dict)
        hdp_topics = hdp.get_topics()
        hdp_df = pd.DataFrame(hdp_topics)
        hdp_dfn = pd.DataFrame(hdp_df.unstack())
        hdp_dfn.reset_index(inplace=True)
        hdp_dfn.columns = ['token_id', 'topic_id', 'token_freq']
        self.db.put_table(hdp_dfn, 'hdp', if_exists='replace')

        # todo: Go the next step and extract topics with words with freqs above a thresh
        thresh = 0.0005
        # Sometimes it's easier to use SQL than to figure out how to do something
        # like this in Pandas
        sql = """
        SELECT topic_id, GROUP_CONCAT(token_str, ' ') AS top_words
        FROM (
            SELECT topic_id, token_id
            FROM hdp
            WHERE token_freq > {}
            ORDER BY topic_id, token_freq DESC
        )
        JOIN token USING (token_id)
        GROUP BY topic_id
        """.format(thresh)
        hdp_topics = pd.read_sql_query(sql, self.db.conn)
        self.db.put_table(hdp_topics, 'hdp_topics')
        thresh = 0.005  # Note this is different from what's in config.ini
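# Toy illustration of the gensim structures built above (assumes gensim is
# installed; the data is made up): gs_corpus is a list of (token_id, count)
# bags of words, gs_dict maps token_id -> token string, and HdpModel infers
# a term-topic matrix from them.
from gensim import models

_gs_corpus = [[(0, 2), (1, 1)], [(1, 3), (2, 1)], [(0, 1), (2, 2)]]
_gs_dict = {0: 'topic', 1: 'model', 2: 'corpus'}
_hdp = models.HdpModel(_gs_corpus, _gs_dict)
print(_hdp.get_topics().shape)  # (n_topics, n_tokens); each row is a distribution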
def get_model_db(slug, trial):
    pcfg = get_project_config(slug)
    model_db_file = pcfg.generate_model_db_file_path(trial)
    model = PoloDb(model_db_file)
    return model
def get_corpus_db(slug):
    pcfg = get_project_config(slug)
    corpus_db_file = pcfg.generate_corpus_db_file_path()
    corpus = PoloDb(corpus_db_file)
    return corpus
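# Self-contained sketch of what these helpers return conceptually: a thin
# wrapper around a SQLite connection that reads whole tables as DataFrames.
# PoloDb's real interface is defined elsewhere in the project; this stand-in
# only mirrors the get_table() call used throughout these snippets.
import sqlite3
import pandas as pd

class _DbSketch:
    def __init__(self, dbfile):
        self.dbfile = dbfile
        self.conn = sqlite3.connect(dbfile)

    def get_table(self, table_name):
        return pd.read_sql_query('SELECT * FROM {}'.format(table_name), self.conn)

# db = _DbSketch(':memory:')  # swap in a real corpus/model db path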
def __init__(self, config):
    self.config = config  # Must be set before it is used below
    corpus_db_file = self.config.generate_corpus_db_file_path()
    self.corpus = PoloDb(corpus_db_file)
class Elements(object):

    def __init__(self, config, trial_name='trial1'):
        # Set some values
        if trial_name not in config.trials:
            raise ValueError("Invalid trial name `{}`.".format(trial_name))
        self.config = config
        self.trial = trial_name
        self.slug = self.config.ini['DEFAULT']['slug']
        self.base_path = self.config.ini['DEFAULT']['base_path']
        self.thresh = float(self.config.ini['DEFAULT']['thresh'])

        # Load the databases
        corpus_db_file = self.config.generate_corpus_db_file_path()
        model_db_file = self.config.generate_model_db_file_path(self.trial)
        self.corpus = PoloDb(corpus_db_file)
        self.model = PoloDb(model_db_file)

    def get_table(self, table_name, db_conn):
        df = self.get_sql("SELECT * FROM {}".format(table_name), db_conn)
        return df

    def get_sql(self, query, db_conn, params=()):
        try:
            df = pd.read_sql_query(query, db_conn, params=params)
            return df
        except Exception:
            return None

    def get_doc_count(self):
        self.doc_count = pd.read_sql_query('SELECT count(*) AS n FROM doc',
                                           self.corpus.conn).n.tolist()[0]
        return self.doc_count

    def get_topic_count(self):
        self.topic_count = pd.read_sql_query('SELECT count(*) AS n FROM topic',
                                             self.model.conn).n.tolist()[0]
        return self.topic_count

    def get_topic(self, topic_id):
        topic_id = int(topic_id)
        sql = 'SELECT * FROM topic WHERE topic_id = ?'
        df = pd.read_sql_query(sql, self.model.conn, params=(topic_id,))
        df.set_index('topic_id', inplace=True)
        df['topic_phrases'] = self.get_topic_phrases(topic_id)
        return df

    def get_topics(self):
        topics = self.model.get_table('topic', set_index=True)
        topics['topic_alpha_zsign'] = topics.topic_alpha_zscore.apply(
            lambda x: 'pos' if x > 0 else 'neg')
        alpha_max = topics.topic_alpha.max()
        topics['topic_alpha_percent'] = ((topics.topic_alpha / alpha_max) * 100).astype(int)
        topic_phrases = self.model.get_table('topicphrase')
        topics['topic_phrases'] = topic_phrases.groupby('topic_id').apply(
            lambda x: ', '.join(x.topic_phrase))
        return topics

    def get_top_bigrams(self, limit=50):
        limit = int(limit)
        sql = "SELECT ngram, ngram_count, score FROM ngrambi " \
              "ORDER BY score DESC LIMIT {}".format(limit)
        df = pd.read_sql_query(sql, self.corpus.conn)
        df['ngram_percent'] = (df.score / df.score.max() * 100).astype('int')
        return df

    def get_topic_phrases(self, topic_id):
        topic_id = int(topic_id)
        sql = "SELECT topic_phrase FROM topicphrase " \
              "WHERE topic_id = ? ORDER BY phrase_weight DESC"
        phrases = ', '.join(
            pd.read_sql_query(sql, self.model.conn,
                              params=(topic_id,)).topic_phrase.tolist())
        return phrases

    def get_all_topic_phrases(self, limit=20):
        sql = """
        select topic_phrase, sum(phrase_count) as n,
            avg(phrase_weight) as w, group_concat(topic_id, ',') as topics
        from topicphrase
        group by topic_phrase
        order by n desc
        limit ?
        """
        return pd.read_sql_query(sql, self.model.conn, params=(limit,))

    def get_topic_entropy_hist(self):
        doctopics = self.model.get_table('doctopic', set_index=True)
        doctopics.unstack()

    # fixme: Deprecated function
    def get_topicdoclabel_matrix(self, sort_by_alpha=True):
        dtm = self.model.get_table('topicdoclabel_matrix', set_index=False)
        col1 = dtm.columns.tolist()[0]
        dtm.set_index(col1, inplace=True)
        topics = self.model.get_table('topic', set_index=True)
        if sort_by_alpha:
            topics = topics.sort_values('topic_alpha', ascending=True)
        dtm = dtm[topics.index.astype('str').tolist()]
        if 'topic_gloss' in topics.columns:
            dtm.columns = topics.reset_index().apply(
                lambda x: 'T{} {}'.format(x.topic_id, x.topic_gloss), axis=1)
        else:
            dtm.columns = topics.reset_index().apply(
                lambda x: 'T{} {}'.format(x.topic_id, x.topic_words), axis=1)
        return dtm

    def get_topicdoc_group_matrix(self, sort_by_alpha=True, group_field='doc_label',
                                  use_gloss_label=False):
        # todo: Should be schema driven
        dtm = self.model.get_table('topic{}_matrix'.format(group_field), set_index=False)
        # todo: Should always be doc_group; should be schema driven
        col1 = dtm.columns.tolist()[0]
        dtm.set_index(col1, inplace=True)
        topics = self.model.get_table('topic', set_index=True)
        if sort_by_alpha:
            topics = topics.sort_values('topic_alpha', ascending=True)
        dtm = dtm[topics.index.astype('str').tolist()]
        if use_gloss_label:
            if 'topic_gloss' in topics.columns:
                dtm.columns = topics.reset_index().apply(
                    lambda x: 'T{} {}'.format(x.topic_id, x.topic_gloss), axis=1)
            else:
                dtm.columns = topics.reset_index().apply(
                    lambda x: 'T{} {}'.format(x.topic_id, x.topic_words), axis=1)
        dtm.index = dtm.index.map(str)  # This is to prevent templates from crashing
        return dtm

    def get_topicdocgroup_counts(self, table_name):
        doc_counts = pd.DataFrame(self.model.get_table(table_name))
        doc_counts.set_index('doc_group', inplace=True)
        return doc_counts

    def get_topicdoc_sum_matrix(self, dtm, group_counts):
        df = dtm.apply(lambda x: x * group_counts.doc_count.values, axis=0)
        return df

    def get_topicdoc_ord_for_topic(self, topic_id):
        topic_id = int(topic_id)
        # doc_col = self.config.ini['DEFAULT']['src_ord_col']
        src_ord_col = self.config.ini['DEFAULT']['src_ord_col']
        table_name = 'topic{}_matrix'.format(src_ord_col)
        sql = "SELECT doc_group, `{1}` as topic_weight FROM {0} ORDER BY doc_group".format(
            table_name, topic_id)
        df = pd.read_sql_query(sql, self.model.conn)
        return df

    def get_doc(self, src_doc_id):
        sql1 = "SELECT * FROM doc WHERE src_doc_id = ?"
        df = pd.read_sql_query(sql1, self.model.conn, params=(src_doc_id,))
        df.set_index('src_doc_id', inplace=True)
        sql2 = "SELECT * FROM doc WHERE doc_id = ?"
        df2 = pd.read_sql_query(sql2, self.corpus.conn, params=(src_doc_id,))
        df2.set_index('doc_id', inplace=True)
        # df2.index.name = 'src_doc_id'  # todo: Fix this madness
        df = df.join(df2, lsuffix='_SRC')
        return df

    def get_doc_id_for_src_doc_id(self, src_doc_id):
        sql = "SELECT doc_id FROM doc WHERE src_doc_id = ?"
        df = pd.read_sql_query(sql, self.model.conn, params=(src_doc_id,))
        doc_id = df.doc_id.tolist()[0]
        return doc_id

    def get_topics_for_doc_id(self, doc_id):
        sql = "SELECT * FROM doctopic WHERE doc_id = ?"
        df = pd.read_sql_query(sql, self.model.conn, params=(doc_id,))
        return df

    def get_docs_for_topic(self, topic_id, limit=10):
        sql = "SELECT src_doc_id, topic_weight, topic_weight_zscore FROM doctopic " \
              "JOIN doc USING(doc_id) WHERE topic_id = ? " \
              "ORDER BY topic_weight DESC LIMIT {}".format(limit)
        df = pd.read_sql_query(sql, self.model.conn, params=(topic_id,))
        df.set_index('src_doc_id', inplace=True)
        doc_ids = ','.join(df.index.astype('str').tolist())
        sql2 = "SELECT doc_id, doc_label, doc_title, " \
               "doc_content as doc_original, doc_content, doc_key " \
               "FROM doc WHERE doc_id IN ({})".format(doc_ids)
        df2 = pd.read_sql_query(sql2, self.corpus.conn)
        df2.set_index('doc_id', inplace=True)
        df = df.join(df2)
        return df

    def get_docs_for_topic_and_label(self, topic_id, doc_col_value, doc_col=None, limit=100):
        if not doc_col:
            # Should wrap these calls with a method
            doc_col = self.config.ini['DEFAULT']['src_ord_col']
        df = pd.read_sql_query(
            "SELECT doc_id, doc_title, doc_content, doc_key FROM doc "
            "WHERE {} = ? LIMIT {}".format(doc_col, limit),
            self.corpus.conn, params=(doc_col_value,))
        df.set_index('doc_id', inplace=True)
        doc_ids = ','.join(df.index.astype('str').tolist())
        sql2 = "SELECT d.*, topic_weight FROM doc d " \
               "JOIN doctopic dt USING(doc_id) " \
               "WHERE src_doc_id IN ({}) AND topic_id = ? " \
               "ORDER BY topic_weight DESC LIMIT 10 ".format(doc_ids)
        df2 = pd.read_sql_query(sql2, self.model.conn, params=(topic_id,))
        df2.set_index('src_doc_id', inplace=True)
        df = df.join(df2)
        return df.sort_values('topic_weight', ascending=False)

    def get_docs_for_group(self, group_field_value, group_field='doc_label'):
        sql1 = "SELECT * FROM doc WHERE {} = ? LIMIT 500".format(group_field)
        df = pd.read_sql_query(sql1, self.corpus.conn, params=(group_field_value,))
        df.set_index('doc_id', inplace=True)
        src_doc_ids = ','.join(df.index.astype('str').tolist())
        sql2 = "SELECT * FROM doc WHERE src_doc_id IN ({})".format(src_doc_ids)
        df2 = pd.read_sql_query(sql2, self.model.conn)
        df2.set_index('src_doc_id', inplace=True)
        df = df.join(df2, rsuffix='_SRC')
        return df

    def get_docs_for_topic_entropy(self, topic_entropy, limit=100):
        topic_entropy_min = float(topic_entropy) - .05
        topic_entropy_max = float(topic_entropy) + .05
        sql = "SELECT src_doc_id, topic_entropy, topic_entropy_zscore FROM doc " \
              "WHERE topic_entropy >= ? AND topic_entropy < ? " \
              "ORDER BY src_doc_id LIMIT {} ".format(limit)
        df = pd.read_sql_query(sql, self.model.conn,
                               params=(topic_entropy_min, topic_entropy_max))
        df.set_index('src_doc_id', inplace=True)
        doc_ids = ','.join(df.index.astype('str').tolist())
        sql2 = "SELECT doc_id, doc_title, doc_content, doc_key, doc_label " \
               "FROM doc WHERE doc_id IN ({})".format(doc_ids)
        df2 = pd.read_sql_query(sql2, self.corpus.conn)
        df2.set_index('doc_id', inplace=True)
        df = df.join(df2)
        return df

    # todo: Put this in database?
    def get_doc_entropy(self):
        sql = "SELECT ROUND(topic_entropy, 1) as h, count() as n " \
              "FROM doc GROUP BY h ORDER BY h"
        df = pd.read_sql_query(sql, self.model.conn)
        return df

    # todo: Put this in database
    def get_doc_entropy_avg(self):
        sql = "SELECT ROUND(AVG(topic_entropy), 1) as h_avg FROM doc"
        df = pd.read_sql_query(sql, self.model.conn)
        return df['h_avg'].tolist()[0]

    def test(self):
        return 1

    def get_topicpair_matrix(self, sim=None, symmetric=True):
        """Get topic pair matrix by similarity or contiguity measure.
        sim values include cosim, jscore, and i_ab"""
        pairs = self.model.get_table('topicpair', set_index=True)
        if symmetric:
            tpm = pd.concat([
                pairs,
                pairs.reorder_levels(['topic_b_id', 'topic_a_id'])
            ]).unstack()
        else:
            tpm = pairs.unstack()
        if sim:
            return tpm[sim]
        else:
            return tpm

    def get_topicpair_net(self, thresh=.5, n=100):
        topics = self.model.get_table('topic')
        pairs = self.model.get_table('topicpair', set_index=False)
        # pairs = pairs.loc[pairs.i_ab >= thresh, ['topic_a_id', 'topic_b_id', 'i_ab']]
        pairs = pairs[pairs.i_ab >= thresh].sort_values('i_ab', ascending=False).head(n)
        # pairs = pairs.sort_values('i_ab', ascending=False).head(n)
        nodes = [{
            'id': t,
            'title': '{1}'.format(t, topics.loc[t].topic_words),
            'label': 'T{} {}'.format(t, topics.loc[t].topic_gloss),
            'value': int((topics.loc[t].topic_alpha / topics.topic_alpha.max()) * 100)
        } for t in pd.concat([pairs.topic_a_id, pairs.topic_b_id], axis=0).unique()]
        edges = [{
            'from': int(pairs.loc[i].topic_a_id),
            'to': int(pairs.loc[i].topic_b_id),
            'value': float(pairs.loc[i].i_ab),
            'label': '{}'.format(round(pairs.loc[i].i_ab, 2))
        } for i in pairs.index]
        return nodes, edges

    def get_topics_related(self, topic_id):
        sql = """
        SELECT topic_b_id as topic_id, jsd, jscore, p_ab, p_aGb, p_bGa, i_ab
        FROM topicpair WHERE topic_a_id = ?
        UNION ALL
        SELECT topic_a_id as topic_id, jsd, jscore, p_ab, p_aGb, p_bGa, i_ab
        FROM topicpair WHERE topic_b_id = ?
        ORDER BY topic_id
        """
        df1 = pd.read_sql_query(sql, self.model.conn, params=(topic_id, topic_id),
                                index_col='topic_id')
        return df1

    def get_group_matrix(self, group_field):
        df = self.model.get_table('topic{}_matrix'.format(group_field))
        return df.set_index('doc_group')

    def get_group_pairs(self, group_field):
        return self.model.get_table('topic{}_pairs'.format(group_field))

    def get_group_counts(self, group_field):
        df = self.model.get_table('topic{}_matrix_counts'.format(group_field))
        return df.set_index('doc_group')

    def get_group_topics(self, group_field, group_name):
        table_name = 'topic{}_matrix'.format(group_field)
        sql = 'SELECT * FROM {} WHERE doc_group = ?'.format(table_name)
        df = pd.read_sql_query(sql, self.model.conn, params=(group_name,))
        df.set_index('doc_group', inplace=True)
        df = df.T
        df.index.name = 'topic_id'
        df.columns = ['topic_weight']
        topics = self.model.get_table('topic', set_index=True)
        df['topic_gloss'] = topics.topic_gloss.tolist()
        df['label'] = 'T' + df.index + ' ' + df.topic_gloss
        return df

    def get_group_comps(self, group_field, group_name):
        table_name = 'topic{}_pairs'.format(group_field)
        sql1 = "SELECT group_b as 'doc_group', kld, jsd, jscore, euclidean " \
               "FROM {} WHERE group_a = ?".format(table_name)
        sql2 = "SELECT group_a as 'doc_group', kld, jsd, jscore, euclidean " \
               "FROM {} WHERE group_b = ?".format(table_name)
        df1 = pd.read_sql_query(sql1, self.model.conn, params=(group_name,))
        df2 = pd.read_sql_query(sql2, self.model.conn, params=(group_name,))
        return pd.concat([df1, df2]).sort_values('doc_group').set_index('doc_group')

    def get_max_topic_weight(self):
        sql = "SELECT value as 'max_tw' FROM config WHERE key = 'doctopic_weight_max'"
        df = pd.read_sql_query(sql, self.model.conn)
        return df.max_tw.tolist()[0]

    ngram_prefixes = ['no', 'uni', 'bi', 'tri', 'quadri']  # Put in central place

    def get_docs_for_ngram(self, ngram, degree):
        my_type = self.ngram_prefixes[degree]
        sql = """
        SELECT doc.doc_id, doc.doc_title, count() as n
        FROM ngram{}doc
        JOIN doc USING(doc_id)
        WHERE ngram = ?
        GROUP BY doc.doc_id
        ORDER BY n DESC, doc_label, doc.doc_id
        LIMIT 100
        """.format(my_type)
        df = pd.read_sql_query(sql, self.corpus.conn, params=(ngram,), index_col='doc_id')
        return df

    def get_ngrams_per_group(self, ngram, degree, group_name='doc_label'):
        my_type = self.ngram_prefixes[degree]
        sql = """
        SELECT DISTINCT d.doc_label as group_name, coalesce(n1, 0) as n
        FROM doc d
        LEFT JOIN (
            SELECT {0}, COUNT() as n1
            FROM ngram{1}doc
            JOIN doc USING(doc_id)
            WHERE ngram = ?
            GROUP BY {0}
        ) t USING({0})
        ORDER BY group_name
        """.format(group_name, my_type)
        df = pd.read_sql_query(sql, self.corpus.conn, params=(ngram,))
        return df

    def get_ngram_group_matrix(self, degree):
        my_type = self.ngram_prefixes[degree]
        try:
            ngm = self.corpus.get_table('ngram{}doc_group_matrix'.format(my_type))
            ngm.set_index('ngram', inplace=True)
        except Exception:
            ngm = pd.DataFrame()
        return ngm

    def get_pca_terms(self):
        try:
            df = self.corpus.get_table('pca_term', set_index='token_id')
            return df
        except Exception:
            return None

    def get_pca_items(self):
        try:
            df = self.corpus.get_table('pca_item')  # , set_index='pc_id')
            df['label'] = df['pc_id'].apply(lambda x: 'PC{}'.format(x), 1)
            return df
        except Exception:
            return None

    def get_pca_docs(self, n=1000):
        """Grab a random sample of n documents for plotting"""
        sql1 = "ATTACH '{}' AS m".format(self.model.dbfile)
        sql2 = """
        SELECT a.*, b.maxtopic
        FROM pca_doc a
        JOIN m.doc b ON a.doc_id = b.src_doc_id
        ORDER BY RANDOM() LIMIT ?
        """
        try:
            self.corpus.conn.execute(sql1)
        except Exception:
            print("Can't attach topic model.")
            return None
        try:
            df = pd.read_sql_query(sql2, self.corpus.conn, params=(n,), index_col='doc_id')
            print(df.head())
            return df
        except sqlite3.Error as e:
            print(e)
            print("Can't get PCA docs")
            return None

    def get_topic_comp_net(self):
        comps = self.corpus.get_table('pca_item', set_index=True)
        topics = self.model.get_table('topic', set_index=True)
        poles = pd.read_sql_query('SELECT * FROM topiccomp_pole',
                                  self.model.conn, index_col='pc_id')
        # poles = self.model.get_table('topiccomp_pole', set_index=True)  # WTF
        corrs = self.model.get_table('topiccomp_corr', set_index=True)
        tids = set(pd.concat([poles.max_pos_topic_id, poles.max_neg_topic_id]).values)
        nodes = [{
            'id': t,
            'title': "{}".format(topics.loc[t].topic_words),
            'label': 'T{} {}'.format(t, topics.loc[t].topic_gloss),
            'value': int((topics.loc[t].topic_alpha / topics.topic_alpha.max()) * 100)
        } for t in tids]
        edges = [{
            'from': poles.loc[p].max_pos_topic_id,
            'to': poles.loc[p].max_neg_topic_id,
            'value': comps.loc[p, 'explained_variance'],
            'label': "PC{}".format(p)
        } for p in poles.index]
        return nodes, edges

    def get_tsne_coords(self, join='left'):
        """Get the x and y values of the word_embeddings table to plot"""
        # todo: Convert into genuine error trap
        if join and join not in ('left', 'inner'):
            join = 'left'
        sql1 = """
        SELECT token_str, tsne_x, tsne_y, token_count, pc_id,
            ROUND(pc_weight * 1000) as pc_w
        FROM word_embedding we
        JOIN token t USING (token_str)
        {} JOIN (
            SELECT token_id, pc_id, pc_weight, MAX(ABS(pc_weight)) AS argmax
            FROM pca_term_narrow
            GROUP BY (token_id)
        ) pca USING (token_id)
        WHERE token_str IN (
            SELECT token_str FROM token
            ORDER BY tfidf_sum DESC
            LIMIT 1000
        )
        """.format(join.upper())
        df = pd.read_sql_query(sql1, self.corpus.conn)
        df['token_norm_count'] = np.round(np.log2(df['token_count'])**1.2).astype('int')
        df['pc_id'] = df['pc_id'].fillna(-1).astype('int')
        df.loc[df.pc_w > 0, 'symbol'] = 0
        df.loc[df.pc_w <= 0, 'symbol'] = 1

        # Experiment: these argmax values should be computed ahead of time and
        # added to the VOCAB tables
        # sql2 = """
        # SELECT word_str as token_str, word_count as topic_token_count,
        #     topic_alpha, topic_gloss, topic_id, MAX(word_count) AS topic_argmax
        # FROM topicword tw
        # JOIN word w USING(word_id)
        # JOIN topic t USING(topic_id)
        # GROUP BY word_id
        # """
        # tw = pd.read_sql_query(sql2, self.model.conn)
        # df = df.merge(tw[['token_str', 'topic_token_count', 'topic_id',
        #                   'topic_alpha', 'topic_gloss']], on='token_str', how='left')

        sql3 = """
        SELECT word_str as token_str, topic_alpha, topic_gloss, topic_id
        FROM word w
        JOIN topic t ON (w.maxtopic = t.topic_id)
        """
        tw = pd.read_sql_query(sql3, self.model.conn)
        df = df.merge(tw[['token_str', 'topic_id', 'topic_alpha', 'topic_gloss']],
                      on='token_str', how='left')
        return df
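# Self-contained illustration of the symmetrization trick in
# get_topicpair_matrix above (toy data): pairs are stored once per unordered
# pair (a < b), so concatenating with the index levels swapped fills in both
# triangles before unstacking into a square matrix.
import pandas as pd

_pairs = pd.DataFrame({
    'topic_a_id': [0, 0, 1],
    'topic_b_id': [1, 2, 2],
    'jsd':        [.3, .6, .4],
}).set_index(['topic_a_id', 'topic_b_id'])
_sym = pd.concat([_pairs, _pairs.reorder_levels(['topic_b_id', 'topic_a_id'])])
print(_sym['jsd'].unstack())  # square matrix, NaN on the diagonal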
class PoloReport():

    def __init__(self, config, trial_name='trial1'):
        # Set some values
        if trial_name not in config.trials:
            raise ValueError("Invalid trial name `{}`.".format(trial_name))
        self.config = config
        self.trial = trial_name
        self.slug = self.config.ini['DEFAULT']['slug']
        self.base_path = self.config.ini['DEFAULT']['base_path']
        self.thresh = float(self.config.ini['DEFAULT']['thresh'])

        # Load the databases
        corpus_db_file = self.config.generate_corpus_db_file_path()
        model_db_file = self.config.generate_model_db_file_path(self.trial)
        self.corpus = PoloDb(corpus_db_file)
        self.model = PoloDb(model_db_file)

    # EXPERIMENTAL
    def get_row_count(self, table):
        n = pd.read_sql_query('SELECT count(*) AS n FROM {}'.format(table),
                              self.corpus.conn).n.tolist()[0]
        return n

    def get_doc_count(self):
        self.doc_count = pd.read_sql_query('SELECT count(*) AS n FROM doc',
                                           self.corpus.conn).n.tolist()[0]
        return self.doc_count

    def get_topic_count(self):
        # The topic table lives in the model database, not the corpus
        self.topic_count = pd.read_sql_query('SELECT count(*) AS n FROM topic',
                                             self.model.conn).n.tolist()[0]
        return self.topic_count

    def get_topic_list(self, by_alpha=True):
        topics = self.model.get_table('topic', set_index=True)
        alpha_max = topics.topic_alpha.max()
        alpha_min = topics.topic_alpha.min()
        from scipy import stats
        topics['topic_alpha_zscore'] = stats.zscore(topics.topic_alpha)
        num_topics = len(topics.index)
        sql = "SELECT topic_id, GROUP_CONCAT(topic_phrase, ', ') as phrases " \
              "FROM topicphrase GROUP BY topic_id ORDER BY phrase_weight DESC"
        phrases = pd.read_sql_query(sql, self.model.conn)
        phrases.set_index('topic_id', inplace=True)
        cards = []
        if by_alpha:
            topic_id_list = topics.topic_alpha.sort_values(ascending=False).index.tolist()
        else:
            topic_id_list = range(num_topics)
        for topic_id in topic_id_list:
            card = dict(
                topic_id=topic_id,
                topic_alpha=topics.loc[topic_id].topic_alpha,
                topic_alpha_zscore=topics.loc[topic_id].topic_alpha_zscore,
                topic_phrases=phrases.loc[topic_id].phrases,
                topic_words=topics.loc[topic_id].topic_words
            )
            cards.append(card)
        return cards

    def get_phrases_for_topic(self, topic_id):
        sql = "SELECT topic_phrase FROM topicphrase " \
              "WHERE topic_id = {} ORDER BY phrase_weight DESC".format(topic_id)
        phrases = ', '.join(pd.read_sql_query(sql, self.model.conn).topic_phrase.tolist())
        return phrases

    def display_topic_list(self, by_alpha=True):
        topic_list = self.get_topic_list(by_alpha)
        df = pd.DataFrame(topic_list)
        return df.to_html()
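# Toy version of the "topic card" assembly in get_topic_list (made-up
# numbers, no database): z-score the alphas, sort by alpha, and emit one
# dict per topic, ready for rendering as an HTML table.
import pandas as pd
from scipy import stats

_topics = pd.DataFrame({'topic_alpha': [.9, .1, .4],
                        'topic_words': ['a b c', 'd e f', 'g h i']})
_topics['topic_alpha_zscore'] = stats.zscore(_topics.topic_alpha)
_cards = [dict(topic_id=i,
               topic_alpha=_topics.loc[i].topic_alpha,
               topic_alpha_zscore=_topics.loc[i].topic_alpha_zscore,
               topic_words=_topics.loc[i].topic_words)
          for i in _topics.topic_alpha.sort_values(ascending=False).index]
print(pd.DataFrame(_cards).to_html())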
class PoloRetro:

    def __init__(self, config):
        self.config = config
        self.corpus = None
        self.model = None
        self.retro = None

    # todo: Rewrite as PoloCombiner or something and make this the init
    def retro_combine(self, corpus_dbfile, model_dbfile, retro_dbfile=None):
        self.corpus = PoloDb(corpus_dbfile)
        self.model = PoloDb(model_dbfile)
        if retro_dbfile is None:
            retro_dbfile = '{}-retro-combo.db'.format(self.config.ini['DEFAULT']['slug'])
        self.retro = PoloDb(retro_dbfile)
        self.create_retro_db()

    def create_all_tables(self):
        self.create_config_table()
        self.create_src_doc_meta_table()
        self.create_src_doc_table()
        self.create_word_table()
        self.create_doc_table()
        self.create_docword_table()
        self.create_topic_table()
        self.create_doctopic_table()
        self.create_doctopic_long_table()
        self.create_topicword_table()
        self.create_topicword_long_table()
        self.create_topicphrase_table()
        self.create_topicpair_table()
        self.create_topicpair_by_deps_table()
        # self.create_doctopic_sig_table()

    def create_doc_table(self):
        doc = self.model.get_table('doc')
        src_doc = self.corpus.get_table('doc')
        new_doc = pd.DataFrame(columns=['doc_id', 'doc_label', 'doc_str'])
        new_doc['doc_id'] = doc['doc_id']
        doc.set_index('doc_id', inplace=True)
        src_doc.set_index('doc_id', inplace=True)
        new_doc.set_index('doc_id', inplace=True)
        new_doc['doc_label'] = doc.doc_label
        new_doc['doc_str'] = src_doc.doc_content
        self.retro.put_table(new_doc, 'doc', if_exists='replace', index=True)

    def create_src_doc_table(self):
        src_doc = self.corpus.get_table('doc')
        src_doc.set_index('doc_id', inplace=True)
        new_src_doc = pd.DataFrame(
            columns='src_meta_id doc_id doc_title doc_uri doc_label doc_ord '
                    'doc_content doc_original doc_year doc_date doc_citation'.split())
        new_src_doc['doc_id'] = src_doc.index
        new_src_doc.set_index('doc_id', inplace=True)
        new_src_doc['doc_title'] = src_doc.doc_title
        new_src_doc['doc_uri'] = src_doc.doc_key
        new_src_doc['doc_uri'] = new_src_doc['doc_uri'].apply(
            lambda x: self.config.ini['DEFAULT']['src_base_url'] + str(x))
        new_src_doc['doc_label'] = src_doc.doc_label
        new_src_doc['doc_ord'] = None
        new_src_doc['doc_content'] = src_doc.doc_content
        new_src_doc['doc_original'] = src_doc.doc_original
        if 'doc_year' in src_doc.columns:
            new_src_doc['doc_year'] = src_doc.doc_year
        if 'doc_date' in src_doc.columns:
            new_src_doc['doc_date'] = src_doc.doc_date
        new_src_doc['doc_citation'] = None
        self.retro.put_table(new_src_doc, 'src_doc', if_exists='replace', index=True)

    def create_src_doc_meta_table(self):
        # fixme: Need to add ord type to config and pass it
        src_doc_meta = pd.DataFrame({
            'src_meta_id': self.config.ini['DEFAULT']['slug'],
            'src_meta_desc': self.config.ini['DEFAULT']['title'],
            'src_meta_base_url': self.config.ini['DEFAULT']['src_base_url'],
            'src_meta_ord_type': None
        }, index=['src_meta_id'])
        self.retro.put_table(src_doc_meta, 'src_doc_meta', if_exists='replace')

    def create_word_table(self):
        word = self.corpus.get_table('token')
        new_word = pd.DataFrame(columns='word_id word_str word_freq word_stem'.split())
        new_word['word_id'] = word.index
        new_word.set_index('word_id', inplace=True)
        new_word['word_str'] = word.token_str
        new_word['word_freq'] = word.token_count
        new_word['word_stem'] = None
        self.retro.put_table(new_word, 'word', if_exists='replace', index=True)

    def create_docword_table(self):
        sql = "SELECT dt.doc_id, t.ROWID as 'word_id', t.token_str as 'word_str', " \
              "t.token_count as 'word_count', NULL as 'tfidf_weight' " \
              "FROM doctoken dt JOIN token t USING(token_str)"
        new_docword = pd.read_sql_query(sql, self.corpus.conn)
        self.retro.put_table(new_docword, 'docword', if_exists='replace')

    def create_config_table(self):
        config = self.model.get_table('config')
        self.retro.put_table(config, 'config', if_exists='replace')

    def create_doctopic_table(self):
        doctopic = self.model.get_table('doctopic')
        doctopic['topic_label'] = doctopic['topic_id'].apply(lambda x: 't{}'.format(x))
        doctopic = doctopic[['doc_id', 'topic_label', 'topic_weight']]
        doctopic.set_index(['doc_id', 'topic_label'], inplace=True)
        doctopic_wide = doctopic.unstack().reset_index()
        doctopic_wide.columns = doctopic_wide.columns.droplevel(0)
        doctopic_wide.rename(columns={'': 'doc_id'}, inplace=True)
        doc = self.model.get_table('doc')
        doc.set_index('doc_id', inplace=True)
        doctopic_wide = doctopic_wide.join(doc[['topic_entropy', 'doc_label']], how='left')
        self.retro.put_table(doctopic_wide, 'doctopic', if_exists='replace')

    def create_topic_table(self):
        topic = self.model.get_table('topic')
        new_topic = pd.DataFrame(
            columns='topic_id topic_alpha total_tokens topic_words'.split())
        new_topic['topic_id'] = topic.topic_id
        new_topic['topic_alpha'] = topic.topic_alpha
        new_topic['topic_words'] = topic.topic_words
        new_topic['total_tokens'] = topic.topic_tokens
        self.retro.put_table(new_topic, 'topic', if_exists='replace')

    def create_topicphrase_table(self):
        topicphrase = self.model.get_table('topicphrase')
        self.retro.put_table(topicphrase, 'topicphrase', if_exists='replace')

    def create_topicword_table(self):
        topicword = self.model.get_table('topicword')
        word = self.model.get_table('word')
        topicword['word_count'] = topicword['word_count'].astype(int)
        topicword['topic_label'] = topicword['topic_id'].apply(lambda x: 't{}'.format(x))
        topicword = topicword[['word_id', 'topic_label', 'word_count']]
        topicword.set_index(['word_id', 'topic_label'], inplace=True)
        topicword_wide = topicword.unstack().reset_index()
        topicword_wide.columns = topicword_wide.columns.droplevel(0)
        topicword_wide.rename(columns={'': 'word_id'}, inplace=True)
        topicword_wide.fillna(0, inplace=True)
        topicword_wide.set_index('word_id', inplace=True)
        word.set_index('word_id', inplace=True)
        topicword_wide['word_str'] = word.word_str
        self.retro.put_table(topicword_wide, 'topicword', if_exists='replace', index=True)

    def create_doctopic_long_table(self):
        doctopic = self.model.get_table('doctopic')
        self.retro.put_table(doctopic, 'doctopic_long', if_exists='replace')

    def create_topicword_long_table(self):
        topicword = self.model.get_table('topicword')
        word = self.model.get_table('word')
        topicword['word_count'] = topicword['word_count'].astype(int)
        word.set_index('word_id', inplace=True)
        topicword.set_index(['word_id', 'topic_id'], inplace=True)
        topicword = topicword.join(word, how='left')
        self.retro.put_table(topicword, 'topicword_long', if_exists='replace', index=True)

    def create_topicpair_table(self):
        topicpair = self.model.get_table('topicpair')
        new_tp = pd.DataFrame(columns='topic_id1 topic_id2 cosine_sim js_div'.split())
        new_tp['topic_id1'] = topicpair.topic_a_id
        new_tp['topic_id2'] = topicpair.topic_b_id
        new_tp['cosine_sim'] = topicpair.cosim
        new_tp['js_div'] = topicpair.jsd
        self.retro.put_table(new_tp, 'topicpair', if_exists='replace')

    def create_topicpair_by_deps_table(self):
        topicpair = self.model.get_table('topicpair')
        topic = self.model.get_table('topic')
        topicpair = topicpair.merge(topic[['topic_id', 'topic_rel_freq']],
                                    left_on='topic_a_id', right_on='topic_id', how='inner')
        topicpair = topicpair.merge(topic[['topic_id', 'topic_rel_freq']],
                                    left_on='topic_b_id', right_on='topic_id', how='inner')
        new_tp = pd.DataFrame(
            columns='topic_a topic_b p_a p_b p_ab p_aGb p_bGa i_ab'.split())
        new_tp['topic_a'] = topicpair.topic_a_id
        new_tp['topic_b'] = topicpair.topic_b_id
        new_tp['p_a'] = topicpair.topic_rel_freq_x
        new_tp['p_b'] = topicpair.topic_rel_freq_y
        new_tp['p_ab'] = topicpair.p_ab
        new_tp['p_aGb'] = topicpair.p_aGb
        new_tp['p_bGa'] = topicpair.p_bGa
        new_tp['i_ab'] = topicpair.i_ab
        self.retro.put_table(new_tp, 'topicpair_by_deps')

    def create_doctopic_sig_table(self):
        pass

    # fixme: The SQL for tables with topics for columns needs to be generated!
    def create_retro_db(self):
        sql_creators = """
        CREATE TABLE IF NOT EXISTS src_doc_meta (src_meta_id TEXT, src_meta_desc TEXT, src_meta_base_url TEXT, src_meta_ord_type TEXT);
        CREATE TABLE IF NOT EXISTS src_doc (src_meta_id TEXT, doc_id INTEGER PRIMARY KEY, doc_title TEXT, doc_uri TEXT UNIQUE, doc_label TEXT, doc_ord INTEGER, doc_content TEXT, doc_original TEXT, doc_year INTEGER, doc_date TEXT, doc_citation TEXT);
        CREATE TABLE IF NOT EXISTS word (word_id INTEGER PRIMARY KEY, word_str TEXT, word_freq INTEGER, word_stem TEXT);
        CREATE TABLE IF NOT EXISTS doc (doc_id INTEGER PRIMARY KEY, doc_label TEXT, doc_str TEXT);
        CREATE TABLE IF NOT EXISTS docword (doc_id INTEGER, word_id INTEGER, word_str TEXT, word_count INTEGER, tfidf_weight REAL);
        CREATE TABLE IF NOT EXISTS config (key TEXT, value TEXT);
        CREATE TABLE IF NOT EXISTS topic (topic_id INTEGER PRIMARY KEY, topic_alpha REAL, total_tokens INTEGER, topic_words TEXT);
        CREATE TABLE IF NOT EXISTS topicphrase (topic_id INTEGER, topic_phrase TEXT, phrase_count INTEGER, phrase_weight REAL);
        CREATE TABLE IF NOT EXISTS doctopic_long (doc_id INTEGER NOT NULL, topic_id INTEGER NOT NULL, topic_weight REAL NOT NULL, UNIQUE (doc_id, topic_id));
        CREATE TABLE IF NOT EXISTS topicword_long (word_id INTEGER NOT NULL, word_str TEXT NOT NULL, topic_id INTEGER NOT NULL, word_count INTEGER NOT NULL, UNIQUE (word_id, topic_id));
        CREATE TABLE IF NOT EXISTS topicpair (topic_id1 INTEGER, topic_id2 INTEGER, cosine_sim REAL, js_div REAL);
        CREATE TABLE IF NOT EXISTS topicpair_by_deps (topic_a INTEGER, topic_b INTEGER, p_a REAL, p_b REAL, p_ab REAL, p_aGb REAL, p_bGa REAL, i_ab REAL);
        CREATE TABLE IF NOT EXISTS doctopic_sig (doc_id INTEGER PRIMARY KEY, topic_sig TEXT, topic_sig_sorted TEXT, topic_n INTEGER)
        """.split(';')

        # Handle wide tables, which have one column per topic
        topic = self.model.get_table('topic')
        n_topics = len(topic.topic_id.tolist())
        topic_fields_real = ','.join(['t{} REAL'.format(tn) for tn in range(n_topics)])
        topic_fields_int = ','.join(['t{} INTEGER'.format(tn) for tn in range(n_topics)])
        sql_creators.append(
            "CREATE TABLE IF NOT EXISTS doctopic (doc_id INTEGER PRIMARY KEY, "
            "doc_label TEXT, topic_entropy REAL, {})".format(topic_fields_real))
        sql_creators.append(
            "CREATE TABLE IF NOT EXISTS topicword (word_id INTEGER, word_str TEXT, {})"
            .format(topic_fields_int))
        for sql_create in sql_creators:
            self.retro.conn.execute(sql_create)
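# Usage sketch (the paths are illustrative, not real project files): combine
# an existing corpus db and model db into a single "retro" db, then populate
# its tables.
# retro = PoloRetro(config)
# retro.retro_combine('corpus/myproject-corpus.db', 'trials/myproject-trial1.db')
# retro.create_all_tables()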