Пример #1
0
    def create_topicdoc_group_matrix(self, group_field='doc_label'):
        """Create the topic-by-group matrix tables for one grouping column.

        Averages topic weights over the documents that share a value of
        `group_field` (a column of the corpus `doc` table) and stores the
        result as `topic{group_field}_matrix`. The per-group count of
        non-null values in topic column 0 is stored as
        `topic{group_field}_matrix_counts`.

        Raises:
            ValueError: if `group_field` is not a column of the corpus
                doc table.
        """
        # Corpus documents; their doc_id is renamed to match the model's
        # src_doc_id column so the two tables can be merged.
        corpus_db = PoloDb(self.config.generate_corpus_db_file_path())
        corpus_docs = corpus_db.get_table('doc')
        if group_field not in corpus_docs.columns:
            raise ValueError('Column `{}` does not exist on corpus doc table.'.format(group_field))
        corpus_docs.rename(columns={'doc_id':'src_doc_id'}, inplace=True)
        del corpus_db

        # Bring in the model's doc_id and index by it, to align with doctopic
        model_docs = self.get_table('doc')
        id_map = model_docs[['doc_id', 'src_doc_id']]
        corpus_docs = corpus_docs.merge(id_map, on='src_doc_id', how='right')
        corpus_docs.set_index('doc_id', inplace=True)
        del model_docs

        # Wide weight matrix (presumably docs x topics -- depends on the
        # index that get_table(..., set_index=True) applies)
        doctopic_table = self.get_table('doctopic', set_index=True)
        weights = doctopic_table['topic_weight'].unstack()
        del doctopic_table

        weights['doc_group'] = corpus_docs[group_field]
        by_group = weights.groupby('doc_group')

        group_means = by_group.mean().fillna(0)
        if group_means.columns.nlevels == 2:
            group_means.columns = group_means.columns.droplevel(0)
        self.put_table(group_means, 'topic{}_matrix'.format(group_field), index=True)

        group_counts = by_group[0].count().fillna(0)
        group_counts.name = 'doc_count'
        self.put_table(group_counts, 'topic{}_matrix_counts'.format(group_field), index=True)
Пример #2
0
    def add_topiccompcorr(self):
        """Add topic component correlation table.

        Multiplies the transposed doc-topic weight matrix by the corpus
        `pca_doc` table and stores the product as `topiccomp_corr`. For each
        column of that product, the index labels of the largest and smallest
        values are stored as `topiccomp_pole`.
        """
        corpus_db_file = self.config.generate_corpus_db_file_path()
        corpus = PoloDb(corpus_db_file)
        pca_doc = corpus.get_table('pca_doc')
        del corpus
        pca_doc = pca_doc.set_index('doc_id')

        # Doc-topic weights keyed by the corpus doc id (src_doc_id) so that
        # they line up with pca_doc's index.
        sql = """
        SELECT a.src_doc_id AS doc_id, topic_id, topic_weight  
        FROM doc a 
        JOIN doctopic b USING(doc_id)
        """
        doctopic = pd.read_sql_query(sql, self.conn, index_col=['doc_id', 'topic_id'])
        dtm = doctopic.unstack()
        dtm.columns = dtm.columns.droplevel(0)
        X = dtm.T.dot(pca_doc)
        self.put_table(X, 'topiccomp_corr', index=True)

        # Add topic poles
        A = X.idxmax()
        B = X.idxmin()
        # `axis` must be passed by keyword: pandas 2.x made every concat
        # argument except `objs` keyword-only.
        C = pd.concat([A, B], axis=1)
        C.columns = ['max_pos_topic_id','max_neg_topic_id']
        # Component labels are turned into integers by stripping 'PC'
        C.index = [int(idx.replace('PC','')) for idx in C.index]
        C.index.name  = 'pc_id'
        self.put_table(C, 'topiccomp_pole', index=True)
Пример #3
0
    def __init__(self, config, trial='trial1'):
        """Initialize MALLET with trial name.

        Copies configuration values onto the instance (as `cfg_*`
        attributes), casts the numeric ones, collects replacement files,
        initializes `self.mallet`, calls `mallet_init()` and opens the model
        database for `trial`.

        Raises:
            ValueError: if `trial` is not listed in `config.trials`.
        """
        if trial not in config.trials:
            # .format() has to be applied to the string, not written inside it
            raise ValueError("Invalid trial name `{}`.".format(trial))

        self.config = config
        self.trial = trial
        self.config.set_config_attributes(self) # Prefixes keys with cfg_
        self.config.set_config_attributes(self, self.trial)

        # todo: Put this in config.ini
        self.cfg_tw_quantile = 0.8

        # Temporary hack to handle casting
        for key in "num_topics num_iterations optimize_interval num_threads num_top_words".split():
            att = 'cfg_{}'.format(key)
            setattr(self, att, int(getattr(self, att)))
        self.cfg_thresh = float(self.cfg_thresh)

        # Get replacement files: the configured value plus every file in
        # ./corpus whose name contains 'replacements_', space-separated.
        # todo: Fix order; higher ngrams should go first ... argues for sortable names
        self.replacement_files = self.cfg_replacements
        for filename in os.listdir('corpus'):
            if 'replacements_' in filename:
                self.replacement_files += ' corpus/' + filename

        self.trial_name = self.trial  # HACK
        self.file_prefix = '{}/{}'.format(self.cfg_mallet_out_dir, self.trial_name)
        self.mallet = {'import-file': {}, 'train-topics': {}}
        self.mallet_init()

        dbfile = self.config.generate_model_db_file_path(self.trial)
        PoloDb.__init__(self, dbfile)
Пример #4
0
    def __init__(self, config, trial_name='trial1'):
        """Store project settings and open the corpus and model databases.

        Raises:
            ValueError: if `trial_name` is not listed in `config.trials`.
        """
        # Set some values
        if trial_name not in config.trials:
            # The original message kept `.format(trial)` inside the literal
            # and named a variable that does not exist in this scope.
            raise ValueError("Invalid trial name `{}`.".format(trial_name))
        self.config = config
        self.trial = trial_name
        self.slug = self.config.ini['DEFAULT']['slug']
        self.base_path = self.config.ini['DEFAULT']['base_path']
        self.thresh = float(self.config.ini['DEFAULT']['thresh'])

        # Load the databases
        corpus_db_file = self.config.generate_corpus_db_file_path()
        model_db_file = self.config.generate_model_db_file_path(self.trial)
        self.corpus = PoloDb(corpus_db_file)
        self.model = PoloDb(model_db_file)
Пример #5
0
    def create_topicdoc_col_matrix(self, group_col):
        """Create the topicdoc matrix tables for a group column.

        `group_col` selects the corpus doc column to group by: 'label' uses
        `doc_label`; 'ord' -- and any other value, which is treated as
        'ord' -- uses the column named by `src_ord_col` in the DEFAULT
        section of the INI file. Mean topic weights per group are stored as
        `topicdoc{group_col}_matrix` and the per-group count of non-null
        values in topic column 0 as `topicdoc{group_col}_matrix_counts`.
        """
        # Corpus documents, with doc_id renamed to the model's src_doc_id
        corpus_db = PoloDb(self.config.generate_corpus_db_file_path())
        corpus_docs = corpus_db.get_table('doc')
        corpus_docs.rename(columns={'doc_id':'src_doc_id'}, inplace=True)
        del corpus_db

        # Bring in the model's doc_id and index by it, to align with doctopic
        model_docs = self.get_table('doc')
        id_map = model_docs[['doc_id', 'src_doc_id']]
        corpus_docs = corpus_docs.merge(id_map, on='src_doc_id', how='right')
        corpus_docs.set_index('doc_id', inplace=True)
        del model_docs

        # The whole doctopic table is used; no weight threshold is applied
        doctopic_table = self.get_table('doctopic', set_index=True)
        weights = doctopic_table['topic_weight'].unstack()
        del doctopic_table

        # todo: Streamline the logic here
        if group_col == 'label':
            doc_col = 'doc_label'
        else:
            # 'ord' itself, and every unrecognized value, ends up here
            group_col = 'ord'
            doc_col = self.config.ini['DEFAULT']['src_ord_col']

        weights['doc_group'] = corpus_docs[doc_col]
        by_group = weights.groupby('doc_group')

        group_means = by_group.mean().fillna(0)
        if group_means.columns.nlevels == 2:
            group_means.columns = group_means.columns.droplevel(0)
        self.put_table(group_means, 'topicdoc{}_matrix'.format(group_col), index=True)

        group_counts = by_group[0].count().fillna(0)
        group_counts.name = 'doc_count'
        self.put_table(group_counts, 'topicdoc{}_matrix_counts'.format(group_col), index=True)
Пример #6
0
 def retro_combine(self, corpus_dbfile, model_dbfile, retro_dbfile=None):
     """Open the corpus, model and retro databases, then build the retro db.

     When `retro_dbfile` is None the file name defaults to
     `<slug>-retro-combo.db`, with the slug read from the DEFAULT section of
     the INI file.
     """
     self.corpus = PoloDb(corpus_dbfile)
     self.model = PoloDb(model_dbfile)
     # Identity test: `== None` can be hijacked by a custom __eq__
     if retro_dbfile is None:
         retro_dbfile = '{}-retro-combo.db'.format(
             self.config.ini['DEFAULT']['slug'])
     self.retro = PoloDb(retro_dbfile)
     self.create_retro_db()
Пример #7
0
    def __init__(self, config):
        """Set up the corpus object from `config` and open its database.

        Raises:
            ValueError: if the configured source file does not exist.
        """
        # Import configs (sets the cfg_* attributes read below)
        self.config = config
        config.set_config_attributes(self)

        source_exists = os.path.isfile(self.cfg_src_file_name)
        if not source_exists:
            raise ValueError(
                "Missing source file. Check value of `src_file_name` in INI file."
            )

        self.dbfile = config.generate_corpus_db_file_path()
        PoloDb.__init__(self, self.dbfile)

        # Optional extra search location for NLTK data
        if self.cfg_nltk_data_path:
            nltk.data.path.append(self.cfg_nltk_data_path)

        # Resources needed for tokenizing into sentences and for tagging
        # fixme: TOKENIZER ASSUMES ENGLISH -- PARAMETIZE THIS
        for resource in ('punkt', 'tagsets', 'averaged_perceptron_tagger'):
            nltk.download(resource)
        self.tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
Пример #8
0
class PoloGensim:
    """Build Gensim-style corpus and dictionary objects from a corpus database."""

    def __init__(self, conifg, corpus_dbfile=None):
        """Open the corpus database.

        `conifg` (sic) is accepted but not used; the misspelled name is kept
        so that existing callers are unaffected.
        """
        self.gs_corpus = None
        self.gs_dict = None
        self.db = PoloDb(corpus_dbfile)

    def make_gs_corpus(self):
        """Fill `self.gs_corpus` with one list of (token_id, token_count)
        pairs per document, documents in order of first appearance in the
        `doctokenbow` table."""
        doctokenbow = self.db.get_table('doctokenbow')
        doctokenbow.set_index('doc_id', inplace=True)
        # Group on the index so each document only gets its own rows. The
        # previous selector `doc_id == doc_id` compared the loop variable
        # with itself and therefore never filtered by document.
        self.gs_corpus = [
            [(row[0], row[1])
             for row in doc_rows[['token_id', 'token_count']].values]
            for _, doc_rows in doctokenbow.groupby(level='doc_id', sort=False)
        ]

    def make_gs_dict(self):
        """Fill `self.gs_dict` with a token_id -> token_str mapping."""
        token = self.db.get_table('token')
        self.gs_dict = {
            row[0]: row[1]
            for row in token[['token_id', 'token_str']].values
        }

    def get_hdp(self):
        """Fit an HDP model and store its topics in the corpus database.

        Writes the long-form topic/token frequencies to table `hdp`
        (replacing it) and the top words per topic to `hdp_topics`.
        """
        hdp = models.HdpModel(self.gs_corpus, self.gs_dict)
        hdp_topics = hdp.get_topics()
        hdp_df = pd.DataFrame(hdp_topics)
        hdp_dfn = pd.DataFrame(hdp_df.unstack())
        hdp_dfn.reset_index(inplace=True)
        hdp_dfn.columns = ['token_id', 'topic_id', 'token_freq']
        self.db.put_table(hdp_dfn, 'hdp', if_exists='replace')

        # todo: Go the next step and extract topic with word with freqs above a thresh
        thresh = 0.0005
        # Sometimes it's easier to use SQL than to figure out how to something
        # like this in Pandas
        sql = """
        SELECT topic_id, GROUP_CONCAT(token_str, ' ') AS top_words
        FROM ( SELECT topic_id, token_id FROM hdp WHERE token_freq > ? ORDER BY topic_id, token_freq DESC )
        JOIN token USING (token_id)
        GROUP BY topic_id
        """
        hdp_topics = pd.read_sql_query(sql, self.db.conn, params=(thresh, ))
        self.db.put_table(hdp_topics, 'hdp_topics')

        # NOTE(review): nothing visible below uses this value
        thresh = 0.005  # Note this is different from what's in config.ini
Пример #9
0
def get_model_db(slug, trial):
    """Return a PoloDb opened on the model database of project `slug`
    for the given `trial`."""
    project_config = get_project_config(slug)
    return PoloDb(project_config.generate_model_db_file_path(trial))
Пример #10
0
def get_corpus_db(slug):
    """Return a PoloDb opened on the corpus database of project `slug`."""
    project_config = get_project_config(slug)
    return PoloDb(project_config.generate_corpus_db_file_path())
Пример #11
0
 def __init__(self, config):
     """Keep `config` and open the corpus database it points to."""
     # Store the argument first: the original read self.config without ever
     # assigning it and ignored the `config` parameter.
     self.config = config
     corpus_db_file = self.config.generate_corpus_db_file_path()
     self.corpus = PoloDb(corpus_db_file)
Пример #12
0
class Elements(object):
    """Query helpers over a project's corpus and model databases.

    `self.corpus` and `self.model` are `PoloDb` objects; most methods run
    one or two SQL queries against them and return pandas DataFrames.
    """

    # Table-name infixes for the n-gram tables, indexed by n-gram degree
    ngram_prefixes = ['no', 'uni', 'bi', 'tri',
                      'quadri']  # Put in central place

    def __init__(self, config, trial_name='trial1'):
        """Store project settings and open the corpus and model databases.

        Raises:
            ValueError: if `trial_name` is not listed in `config.trials`.
        """
        # Set some values
        if trial_name not in config.trials:
            # The original message kept `.format(trial)` inside the literal
            raise ValueError("Invalid trial name `{}`.".format(trial_name))
        self.config = config
        self.trial = trial_name
        self.slug = self.config.ini['DEFAULT']['slug']
        self.base_path = self.config.ini['DEFAULT']['base_path']
        self.thresh = float(self.config.ini['DEFAULT']['thresh'])

        # Load the databases
        corpus_db_file = self.config.generate_corpus_db_file_path()
        model_db_file = self.config.generate_model_db_file_path(self.trial)
        self.corpus = PoloDb(corpus_db_file)
        self.model = PoloDb(model_db_file)

    def get_table(self, table_name, db_conn):
        """Return every row of `table_name`, or None if the query fails."""
        df = self.get_sql("SELECT * FROM {}".format(table_name), db_conn)
        return df

    def get_sql(self, query, db_conn, params=()):
        """Run `query` on `db_conn`; return a DataFrame, or None on failure."""
        try:
            df = pd.read_sql_query(query, db_conn, params=params)
            return df
        except Exception:
            # Deliberately best-effort: a failed query is reported as None.
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            return None

    def get_doc_count(self):
        """Return (and cache on `self.doc_count`) the corpus doc row count."""
        self.doc_count = pd.read_sql_query('SELECT count(*) AS n FROM doc',
                                           self.corpus.conn).n.tolist()[0]
        return self.doc_count

    def get_topic_count(self):
        """Return (and cache on `self.topic_count`) the model topic row count."""
        self.topic_count = pd.read_sql_query('SELECT count(*) AS n FROM topic',
                                             self.model.conn).n.tolist()[0]
        return self.topic_count

    def get_topic(self, topic_id):
        """Return the topic row for `topic_id`, with a `topic_phrases` column."""
        topic_id = int(topic_id)
        sql = 'SELECT * FROM topic WHERE topic_id = ?'
        df = pd.read_sql_query(sql, self.model.conn, params=(topic_id, ))
        df.set_index('topic_id', inplace=True)
        df['topic_phrases'] = self.get_topic_phrases(topic_id)
        return df

    def get_topics(self):
        """Return the topic table with derived display columns.

        Adds `topic_alpha_zsign` ('pos'/'neg'), `topic_alpha_percent`
        (alpha as an integer percentage of the largest alpha) and
        `topic_phrases` (comma-joined phrases per topic).
        """
        topics = self.model.get_table('topic', set_index=True)
        topics['topic_alpha_zsign'] = topics.topic_alpha_zscore.apply(
            lambda x: 'pos' if x > 0 else 'neg')
        alpha_max = topics.topic_alpha.max()
        topics['topic_alpha_percent'] = ((topics.topic_alpha / alpha_max) *
                                         100).astype(int)
        topic_phrases = self.model.get_table('topicphrase')
        topics['topic_phrases'] = topic_phrases.groupby('topic_id').apply(
            lambda x: ', '.join(x.topic_phrase))
        return topics

    def get_top_bigrams(self, limit=50):
        """Return the `limit` highest-scoring bigrams with a percent column."""
        limit = int(limit)
        sql = "SELECT ngram, ngram_count, score FROM ngrambi " \
              "ORDER BY score DESC LIMIT {}".format(limit)
        df = pd.read_sql_query(sql, self.corpus.conn)
        df['ngram_percent'] = (df.score / df.score.max() * 100).astype('int')
        return df

    def get_topic_phrases(self, topic_id):
        """Return a topic's phrases as one comma-separated string, ordered
        by descending phrase weight."""
        topic_id = int(topic_id)
        sql = "SELECT topic_phrase FROM topicphrase " \
              "WHERE topic_id = ? ORDER BY phrase_weight DESC"
        phrases = ', '.join(
            pd.read_sql_query(sql, self.model.conn,
                              params=(topic_id, )).topic_phrase.tolist())
        return phrases

    def get_all_topic_phrases(self, limit=20):
        """Return the `limit` most frequent phrases across all topics."""
        sql = """
        select topic_phrase, 
            sum(phrase_count) as n, 
            avg(phrase_weight) as w,
            group_concat(topic_id, ',') as topics
        from topicphrase
        group by topic_phrase
        order by n desc
        limit ?
        """
        return pd.read_sql_query(sql, self.model.conn, params=(limit, ))

    def get_topic_entropy_hist(self):
        """Load and unstack the doctopic table; the result is discarded.

        NOTE(review): looks unfinished -- this always returns None.
        """
        doctopics = self.model.get_table('doctopic', set_index=True)
        doctopics.unstack()

    # fixme: Deprecated function
    def get_topicdoclabel_matrix(self, sort_by_alpha=True):
        """Return the `topicdoclabel_matrix` table with labelled topic columns.

        The first column of the stored table becomes the index. Columns are
        ordered by ascending topic alpha when `sort_by_alpha` is true and
        renamed to 'T<id> <gloss>' (or 'T<id> <words>' when the topic table
        has no `topic_gloss` column).
        """
        dtm = self.model.get_table('topicdoclabel_matrix', set_index=False)
        col1 = dtm.columns.tolist()[0]
        dtm.set_index(col1, inplace=True)

        topics = self.model.get_table('topic', set_index=True)
        if sort_by_alpha:
            topics = topics.sort_values('topic_alpha', ascending=True)
        dtm = dtm[topics.index.astype('str').tolist()]
        if 'topic_gloss' in topics.columns:
            dtm.columns = topics.reset_index().apply(
                lambda x: 'T{} {}'.format(x.topic_id, x.topic_gloss), axis=1)
        else:
            dtm.columns = topics.reset_index().apply(
                lambda x: 'T{} {}'.format(x.topic_id, x.topic_words), axis=1)
        return dtm

    def get_topicdoc_group_matrix(self,
                                  sort_by_alpha=True,
                                  group_field='doc_label',
                                  use_gloss_label=False):
        """Return the `topic{group_field}_matrix` table.

        The first column of the stored table becomes the index (converted
        to strings). Topic columns are ordered by ascending alpha when
        `sort_by_alpha` is true and relabelled 'T<id> <gloss or words>' when
        `use_gloss_label` is true.
        """
        dtm = self.model.get_table(
            'topic{}_matrix'.format(group_field),
            set_index=False)  # todo: Should be schema driven
        col1 = dtm.columns.tolist()[
            0]  # todo: Should always be doc_group; should be schema driven
        dtm.set_index(col1, inplace=True)

        topics = self.model.get_table('topic', set_index=True)
        if sort_by_alpha:
            topics = topics.sort_values('topic_alpha', ascending=True)
        dtm = dtm[topics.index.astype('str').tolist()]

        if use_gloss_label:
            if 'topic_gloss' in topics.columns:
                dtm.columns = topics.reset_index().apply(
                    lambda x: 'T{} {}'.format(x.topic_id, x.topic_gloss),
                    axis=1)
            else:
                dtm.columns = topics.reset_index().apply(
                    lambda x: 'T{} {}'.format(x.topic_id, x.topic_words),
                    axis=1)

        dtm.index = dtm.index.map(
            str)  # This is to prevent templates from crashing
        return dtm

    def get_topicdocgrooup_counts(self, table_name):
        """Return model table `table_name` indexed by `doc_group`."""
        doc_counts = pd.DataFrame(self.model.get_table(table_name))
        doc_counts.set_index('doc_group', inplace=True)
        return doc_counts

    def get_topicdoc_sum_matrix(self, dtm, group_counts):
        """Scale every column of `dtm` by `group_counts.doc_count`,
        position by position."""
        df = dtm.apply(lambda x: x * group_counts.doc_count.values, axis=0)
        return df

    def get_topicdoc_ord_for_topic(self, topic_id):
        """Return (doc_group, topic_weight) rows for one topic from the
        matrix table of the configured `src_ord_col`."""
        topic_id = int(topic_id)
        src_ord_col = self.config.ini['DEFAULT']['src_ord_col']
        table_name = 'topic{}_matrix'.format(src_ord_col)
        sql = "SELECT  doc_group, `{1}` as topic_weight FROM {0} ORDER BY doc_group".format(
            table_name, topic_id)
        df = pd.read_sql_query(sql, self.model.conn)
        return df

    def get_doc(self, src_doc_id):
        """Return the model doc row for `src_doc_id` joined to the corpus
        doc row with the same id; overlapping model columns get the suffix
        `_SRC`."""
        sql1 = "SELECT * FROM doc WHERE src_doc_id = ?"
        df = pd.read_sql_query(sql1, self.model.conn, params=(src_doc_id, ))
        df.set_index('src_doc_id', inplace=True)

        sql2 = "SELECT * FROM doc WHERE doc_id = ?"
        df2 = pd.read_sql_query(sql2, self.corpus.conn, params=(src_doc_id, ))
        df2.set_index('doc_id', inplace=True)
        #df2.index.name = 'src_doc_id' # todo: Fix this madness

        df = df.join(df2, lsuffix='_SRC')
        return df

    def get_doc_id_for_src_doc_id(self, src_doc_id):
        """Return the model doc_id of the first row matching `src_doc_id`."""
        sql = "SELECT doc_id FROM doc WHERE src_doc_id = ?"
        df = pd.read_sql_query(sql, self.model.conn, params=(src_doc_id, ))
        doc_id = df.doc_id.tolist()[0]
        return doc_id

    def get_topics_for_doc_id(self, doc_id):
        """Return every doctopic row of model document `doc_id`."""
        sql = "SELECT * FROM doctopic WHERE doc_id = ?"
        df = pd.read_sql_query(sql, self.model.conn, params=(doc_id, ))
        return df

    def get_docs_for_topic(self, topic_id, limit=10):
        """Return the `limit` documents with the highest weight for a topic,
        joined to their corpus doc columns."""
        # Cast before formatting into SQL, as get_top_bigrams does
        limit = int(limit)
        sql = "SELECT src_doc_id, topic_weight, topic_weight_zscore FROM doctopic " \
              "JOIN doc USING(doc_id) WHERE topic_id = ? " \
              "ORDER BY topic_weight DESC LIMIT {}".format(limit)
        df = pd.read_sql_query(sql, self.model.conn, params=(topic_id, ))
        df.set_index('src_doc_id', inplace=True)
        doc_ids = ','.join(df.index.astype('str').tolist())
        sql2 = "SELECT doc_id, doc_label, doc_title, " \
               "doc_content as doc_original, doc_content, doc_key " \
               "FROM doc WHERE doc_id IN ({})".format(doc_ids)
        df2 = pd.read_sql_query(
            sql2,
            self.corpus.conn,
        )
        df2.set_index('doc_id', inplace=True)
        df = df.join(df2)
        return df

    def get_docs_for_topic_and_label(self,
                                     topic_id,
                                     doc_col_value,
                                     doc_col=None,
                                     limit=100):
        """Return corpus docs whose `doc_col` equals `doc_col_value`, joined
        to their model rows for `topic_id`, sorted by descending weight.

        `doc_col` defaults to the configured `src_ord_col`. The model query
        keeps at most 10 rows.
        """
        if not doc_col:
            doc_col = self.config.ini['DEFAULT'][
                'src_ord_col']  # Should wrap these calls with a method
        # Cast before formatting into SQL, as get_top_bigrams does
        limit = int(limit)
        df = pd.read_sql_query(
            "SELECT doc_id, doc_title, doc_content, doc_key  FROM doc WHERE {} = ? LIMIT {}"
            .format(doc_col, limit),
            self.corpus.conn,
            params=(doc_col_value, ))
        df.set_index('doc_id', inplace=True)
        doc_ids = ','.join(df.index.astype('str').tolist())
        sql2 = "SELECT d.*, topic_weight FROM doc d " \
               "JOIN doctopic dt USING(doc_id) " \
               "WHERE src_doc_id IN ({}) AND topic_id = ? " \
               "ORDER BY topic_weight DESC LIMIT 10 ".format(doc_ids)
        df2 = pd.read_sql_query(sql2, self.model.conn, params=(topic_id, ))
        df2.set_index('src_doc_id', inplace=True)
        df = df.join(df2)
        return df.sort_values('topic_weight', ascending=False)

    def get_docs_for_group(self, group_field_value, group_field='doc_label'):
        """Return up to 500 corpus docs whose `group_field` equals
        `group_field_value`, joined to their model doc rows (overlapping
        model columns get the suffix `_SRC`)."""
        sql1 = "SELECT * FROM doc WHERE {} = ? LIMIT 500".format(group_field)
        df = pd.read_sql_query(sql1,
                               self.corpus.conn,
                               params=(group_field_value, ))
        df.set_index('doc_id', inplace=True)
        src_doc_ids = ','.join(df.index.astype('str').tolist())
        sql2 = "SELECT * FROM doc WHERE src_doc_id IN ({})".format(src_doc_ids)
        df2 = pd.read_sql_query(sql2, self.model.conn)
        df2.set_index('src_doc_id', inplace=True)
        df = df.join(df2, rsuffix='_SRC')
        return df

    def get_docs_for_topic_entropy(self, topic_entropy, limit=100):
        """Return documents whose topic entropy lies in
        [topic_entropy - .05, topic_entropy + .05), joined to their corpus
        doc columns."""
        topic_entropy_min = float(topic_entropy) - .05
        topic_entropy_max = float(topic_entropy) + .05
        # Cast before formatting into SQL, as get_top_bigrams does
        limit = int(limit)
        sql = "SELECT src_doc_id, topic_entropy, topic_entropy_zscore FROM doc " \
              "WHERE topic_entropy >= ? AND topic_entropy < ? " \
              "ORDER BY src_doc_id LIMIT {} ".format(limit)
        df = pd.read_sql_query(sql,
                               self.model.conn,
                               params=(topic_entropy_min, topic_entropy_max))
        df.set_index('src_doc_id', inplace=True)
        doc_ids = ','.join(df.index.astype('str').tolist())
        sql2 = "SELECT doc_id, doc_title, doc_content, doc_key, doc_label " \
               "FROM doc WHERE doc_id IN ({})".format(doc_ids)
        df2 = pd.read_sql_query(
            sql2,
            self.corpus.conn,
        )
        df2.set_index('doc_id', inplace=True)
        df = df.join(df2)
        return df

    # todo: Put this in database?
    def get_doc_entropy(self):
        """Return document counts per topic-entropy value rounded to one
        decimal place."""
        sql = "SELECT ROUND(topic_entropy, 1) as h, count() as n " \
              "FROM doc GROUP BY h ORDER BY h"
        df = pd.read_sql_query(sql, self.model.conn)
        return df

    # todo: Put this in database
    def get_doc_entropy_avg(self):
        """Return the mean topic entropy over all docs, rounded to one
        decimal place."""
        sql = "SELECT ROUND(AVG(topic_entropy), 1) as h_avg FROM doc"
        df = pd.read_sql_query(sql, self.model.conn)
        return df['h_avg'].tolist()[0]

    def test(self):
        """Return 1."""
        return 1

    def get_topicpair_matrix(self, sim=None, symmetric=True):
        """Get topic pair matrix by similarity or contiguity measure.

        sim values include cosim, jscore, and i_ab. When `symmetric` is
        true each stored pair is also added with its two ids swapped before
        unstacking. Returns the sub-matrix for `sim` when given, otherwise
        the full unstacked table.
        """
        pairs = self.model.get_table('topicpair', set_index=True)
        if symmetric:
            # pd.concat replaces DataFrame.append, removed in pandas 2.0
            mirrored = pairs.reorder_levels(['topic_b_id', 'topic_a_id'])
            tpm = pd.concat([pairs, mirrored]).unstack()
        else:
            tpm = pairs.unstack()

        if sim:
            return tpm[sim]
        else:
            return tpm

    def get_topicpair_net(self, thresh=.5, n=100):
        """Return (nodes, edges) dict lists for the `n` topic pairs with the
        highest `i_ab` at or above `thresh`."""
        # NOTE(review): `topics` keeps its default index here, so
        # topics.loc[t] presumably relies on row position matching the
        # topic id -- confirm.
        topics = self.model.get_table('topic')
        pairs = self.model.get_table('topicpair', set_index=False)
        pairs = pairs[pairs.i_ab >= thresh].sort_values(
            'i_ab', ascending=False).head(n)
        nodes = [{
            'id': t,
            'title': '{1}'.format(t, topics.loc[t].topic_words),
            'label': 'T{} {}'.format(t, topics.loc[t].topic_gloss),
            'value':
            int((topics.loc[t].topic_alpha / topics.topic_alpha.max()) * 100)
        } for t in pd.concat([pairs.topic_a_id, pairs.topic_b_id],
                             axis=0).unique()]
        edges = [{
            'from': int(pairs.loc[i].topic_a_id),
            'to': int(pairs.loc[i].topic_b_id),
            'value': float(pairs.loc[i].i_ab),
            'label': '{}'.format(round(pairs.loc[i].i_ab, 2))
        } for i in pairs.index]
        return nodes, edges

    def get_topics_related(self, topic_id):
        """Return the pair measures of every topic paired with `topic_id`,
        indexed by the other topic's id."""
        sql = """
        SELECT topic_b_id as topic_id, jsd, jscore, p_ab, p_aGb, p_bGa, i_ab 
        FROM topicpair WHERE topic_a_id = ?
        UNION ALL
        SELECT topic_a_id as topic_id, jsd, jscore, p_ab, p_aGb, p_bGa, i_ab 
        FROM topicpair WHERE topic_b_id = ?
        ORDER BY topic_id
        """
        df1 = pd.read_sql_query(sql,
                                self.model.conn,
                                params=(topic_id, topic_id),
                                index_col='topic_id')
        return df1

    def get_group_matrix(self, group_field):
        """Return `topic{group_field}_matrix` indexed by `doc_group`."""
        df = self.model.get_table('topic{}_matrix'.format(group_field))
        return df.set_index('doc_group')

    def get_group_pairs(self, group_field):
        """Return the `topic{group_field}_pairs` table."""
        return self.model.get_table('topic{}_pairs'.format(group_field))

    def get_group_counts(self, group_field):
        """Return `topic{group_field}_matrix_counts` indexed by `doc_group`."""
        df = self.model.get_table('topic{}_matrix_counts'.format(group_field))
        return df.set_index('doc_group')

    def get_group_topics(self, group_field, group_name):
        """Return one group's topic weights as rows, with gloss and label
        columns added."""
        table_name = 'topic{}_matrix'.format(group_field)
        sql = 'SELECT * FROM {} WHERE doc_group = ?'.format(table_name)
        df = pd.read_sql_query(sql, self.model.conn, params=(group_name, ))
        df.set_index('doc_group', inplace=True)
        df = df.T
        df.index.name = 'topic_id'
        df.columns = ['topic_weight']
        topics = self.model.get_table('topic', set_index=True)
        # Assigned by position: assumes the matrix columns and the topic
        # table are in the same order -- TODO confirm
        df['topic_gloss'] = topics.topic_gloss.tolist()
        df['label'] = 'T' + df.index + ' ' + df.topic_gloss
        return df

    def get_group_comps(self, group_field, group_name):
        """Return the pairwise measures between `group_name` and every other
        group, indexed and sorted by the other group's name."""
        table_name = 'topic{}_pairs'.format(group_field)
        sql1 = "SELECT group_b as 'doc_group', kld, jsd, jscore, euclidean " \
               "FROM {} WHERE group_a = ?".format(table_name)
        sql2 = "SELECT group_a as 'doc_group', kld, jsd, jscore, euclidean " \
               "FROM {} WHERE group_b = ?".format(table_name)
        df1 = pd.read_sql_query(sql1, self.model.conn, params=(group_name, ))
        df2 = pd.read_sql_query(sql2, self.model.conn, params=(group_name, ))
        # pd.concat replaces DataFrame.append, removed in pandas 2.0
        combined = pd.concat([df1, df2])
        return combined.sort_values('doc_group').set_index('doc_group')

    def get_max_topic_weight(self):
        """Return the `doctopic_weight_max` value from the model config table."""
        sql = "SELECT value as 'max_tw' FROM config WHERE key = 'doctopic_weight_max'"
        df = pd.read_sql_query(sql, self.model.conn)
        return df.max_tw.tolist()[0]

    def get_docs_for_ngram(self, ngram, degree):
        """Return up to 100 documents containing `ngram`, most occurrences
        first, indexed by doc_id. `degree` indexes `ngram_prefixes`."""
        my_type = self.ngram_prefixes[degree]
        sql = """SELECT doc.doc_id, doc.doc_title, count() as n 
        FROM ngram{}doc 
        JOIN doc USING(doc_id) 
        WHERE ngram = ?
        GROUP BY doc.doc_id
        ORDER BY n DESC, doc_label, doc.doc_id 
        LIMIT 100 """.format(my_type)
        df = pd.read_sql_query(sql,
                               self.corpus.conn,
                               params=(ngram, ),
                               index_col='doc_id')
        return df

    def get_ngrams_per_group(self, ngram, degree, group_name='doc_label'):
        """Return occurrence counts of `ngram` per group (0 where absent).

        NOTE(review): the outer SELECT always reports `doc_label` even when
        `group_name` names another column -- confirm that is intended.
        """
        my_type = self.ngram_prefixes[degree]
        sql = """
        SELECT DISTINCT d.doc_label as group_name,  coalesce(n1, 0) as n 
        FROM doc d
        LEFT JOIN (
            SELECT {0}, COUNT() as n1 
            FROM ngram{1}doc 
            JOIN doc USING(doc_id) WHERE ngram = ?
            GROUP BY {0}
        ) t USING({0})
        ORDER BY group_name
        """.format(group_name, my_type)
        df = pd.read_sql_query(sql, self.corpus.conn, params=(ngram, ))
        return df

    def get_ngram_group_matrix(self, degree):
        """Return the n-gram group matrix indexed by `ngram`, or an empty
        DataFrame when the table cannot be loaded."""
        my_type = self.ngram_prefixes[degree]
        try:
            ngm = self.corpus.get_table(
                'ngram{}doc_group_matrix'.format(my_type))
            ngm.set_index('ngram', inplace=True)
        except Exception:
            # The fallback used to be bound to a misspelled name (`mgm`),
            # so the return below failed with UnboundLocalError.
            ngm = pd.DataFrame()
        return ngm

    def get_pca_terms(self):
        """Return the corpus `pca_term` table, or None if it cannot be read."""
        try:
            df = self.corpus.get_table('pca_term', set_index='token_id')
            return df
        except Exception:
            return None

    def get_pca_items(self):
        """Return the corpus `pca_item` table with a 'PC<pc_id>' label
        column, or None if it cannot be read."""
        try:
            df = self.corpus.get_table('pca_item')  #, set_index='pc_id')
            # A stray positional `1` used to be passed to Series.apply here;
            # it is not an axis and newer pandas versions reject it.
            df['label'] = df['pc_id'].apply(lambda x: 'PC{}'.format(x))
            return df
        except Exception:
            return None

    def get_pca_docs(self, n=1000):
        """Grab a random sample of n documents for plotting.

        Attaches the model database to the corpus connection as `m` for the
        duration of the query. Returns a DataFrame indexed by doc_id, or
        None when the attach or the query raises a sqlite3 error.
        """
        sql1 = "ATTACH '{}' AS m".format(self.model.dbfile)
        sql2 = """
        SELECT a.*, b.maxtopic 
        FROM pca_doc a
        JOIN m.doc b ON a.doc_id = b.src_doc_id
        ORDER BY RANDOM() LIMIT ?
        """
        try:
            self.corpus.conn.execute(sql1)
        except sqlite3.Error:
            print("Cant't attach topic model.")
            return None
        try:
            df = pd.read_sql_query(sql2,
                                   self.corpus.conn,
                                   params=(n, ),
                                   index_col='doc_id')
            print(df.head())
            return df
        except sqlite3.Error as e:
            print(e)
            print("Can't get PCA docs")
            return None
        finally:
            # Detach again; otherwise a second call on the same connection
            # fails at ATTACH because the name `m` is still in use.
            try:
                self.corpus.conn.execute("DETACH DATABASE m")
            except sqlite3.Error:
                pass

    def get_topic_comp_net(self):
        """Return (nodes, edges) dict lists linking, for every principal
        component, its max-positive and max-negative topics."""
        comps = self.corpus.get_table('pca_item', set_index=True)
        topics = self.model.get_table('topic', set_index=True)
        poles = pd.read_sql_query('SELECT * FROM topiccomp_pole',
                                  self.model.conn,
                                  index_col='pc_id')
        # poles = self.model.get_table('topiccomp_pole', set_index=True) # WTF
        corrs = self.model.get_table('topiccomp_corr', set_index=True)
        tids = set(
            pd.concat([poles.max_pos_topic_id, poles.max_neg_topic_id]).values)
        nodes = [{
            'id': t,
            'title': "{}".format(topics.loc[t].topic_words),
            'label': 'T{} {}'.format(t, topics.loc[t].topic_gloss),
            'value':
            int((topics.loc[t].topic_alpha / topics.topic_alpha.max()) * 100)
        } for t in tids]
        edges = [{
            'from': poles.loc[p].max_pos_topic_id,
            'to': poles.loc[p].max_neg_topic_id,
            'value': comps.loc[p, 'explained_variance'],
            'label': "PC{}".format(p)
        } for p in poles.index]
        return nodes, edges

    def get_tsne_coords(self, join='left'):
        """Get the x and y values of the word_embeddings table to plot.

        Restricted to the 1000 tokens with the highest `tfidf_sum`. `join`
        chooses how the PCA term table is joined; any other non-empty value
        is replaced by 'left'. Topic columns from the model's `word` and
        `topic` tables are merged in on `token_str`.
        """
        # todo: Convert into genuine error trap
        if join and join not in ('left', 'inner'):
            join = 'left'

        sql1 = """
        SELECT token_str, tsne_x, tsne_y, token_count, pc_id, ROUND(pc_weight * 1000) as pc_w 
        FROM word_embedding we 
        JOIN token t USING (token_str)
        {} JOIN (
            SELECT token_id, pc_id, pc_weight, MAX(ABS(pc_weight)) AS argmax
            FROM pca_term_narrow
            GROUP BY (token_id)
        ) pca USING (token_id)
        WHERE token_str IN (
            SELECT token_str 
            FROM token
            ORDER BY tfidf_sum DESC
            LIMIT 1000
        )
        """.format(join.upper())
        df = pd.read_sql_query(sql1, self.corpus.conn)
        df['token_norm_count'] = np.round(np.log2(
            df['token_count'])**1.2).astype('int')
        df['pc_id'] = df['pc_id'].fillna(-1).astype('int')
        # Marker symbol by sign of the rounded component weight
        df.loc[df.pc_w > 0, 'symbol'] = 0
        df.loc[df.pc_w <= 0, 'symbol'] = 1

        # Experiment: these argmax values should be computed ahead of time and
        # added to the VOCAB tables

        sql3 = """
        SELECT word_str as token_str , topic_alpha, topic_gloss, topic_id
        FROM word w 
        JOIN topic t ON (w.maxtopic = t.topic_id)
        """
        tw = pd.read_sql_query(sql3, self.model.conn)
        df = df.merge(
            tw[['token_str', 'topic_id', 'topic_alpha', 'topic_gloss']],
            on='token_str',
            how='left')

        return df
Пример #13
0
class PoloReport():
    """Reporting helper that reads from a corpus database and one trial's model database.

    Both databases are opened through ``PoloDb`` in ``__init__`` and kept on
    ``self.corpus`` and ``self.model``; queries run through their ``conn``
    attributes with ``pandas.read_sql_query``.
    """

    def __init__(self, config, trial_name='trial1'):
        """Validate the trial name, copy settings from the config, and open both databases.

        Args:
            config: Configuration object. The code reads ``config.trials``,
                ``config.ini['DEFAULT']`` (keys ``slug``, ``base_path``,
                ``thresh``) and calls ``generate_corpus_db_file_path()`` and
                ``generate_model_db_file_path(trial)``.
            trial_name: Name of the trial whose model database is opened.

        Raises:
            ValueError: If ``trial_name`` is not listed in ``config.trials``.
        """
        # Set some values
        if trial_name not in config.trials:
            raise ValueError("Invalid trial name `{}`.".format(trial_name))
        self.config = config
        self.trial = trial_name
        defaults = self.config.ini['DEFAULT']
        self.slug = defaults['slug']
        self.base_path = defaults['base_path']
        self.thresh = float(defaults['thresh'])

        # Load the databases
        corpus_db_file = self.config.generate_corpus_db_file_path()
        model_db_file = self.config.generate_model_db_file_path(self.trial)
        self.corpus = PoloDb(corpus_db_file)
        self.model = PoloDb(model_db_file)

    @staticmethod
    def _count_rows(table, conn):
        """Return ``SELECT count(*)`` for ``table`` on ``conn`` as a plain Python number."""
        # Table names cannot be bound as SQL parameters, so the name is
        # interpolated; callers must only pass trusted table names.
        sql = 'SELECT count(*) AS n FROM {}'.format(table)
        return pd.read_sql_query(sql, conn)['n'].tolist()[0]

    # EXPERIMENTAL
    def get_row_count(self, table):
        """Return the number of rows in ``table`` of the corpus database.

        ``table`` is placed into the SQL text as-is, so it must be a trusted
        table name, never user input.
        """
        return self._count_rows(table, self.corpus.conn)

    def get_doc_count(self):
        """Count rows of the corpus ``doc`` table, cache the value on ``self.doc_count``, and return it."""
        self.doc_count = self._count_rows('doc', self.corpus.conn)
        return self.doc_count

    def get_topic_count(self):
        """Count rows of the ``topic`` table, cache the value on ``self.topic_count``, and return it.

        The count is taken from the model database, which is where the other
        methods of this class read the ``topic`` table from.
        """
        self.topic_count = self._count_rows('topic', self.model.conn)
        return self.topic_count

    def get_topic_list(self, by_alpha=True):
        """Build one summary dict ("card") per topic.

        Args:
            by_alpha: When true, cards are ordered by descending
                ``topic_alpha``; otherwise by ascending topic id.

        Returns:
            list of dict with keys ``topic_id``, ``topic_alpha``,
            ``topic_alpha_zscore``, ``topic_phrases`` and ``topic_words``.
            ``topic_phrases`` is the topic's phrases joined with ``', '`` in
            descending ``phrase_weight`` order, or ``''`` when the topic has
            no phrases.
        """
        from scipy import stats

        topics = self.model.get_table('topic', set_index=True)
        topics['topic_alpha_zscore'] = stats.zscore(topics.topic_alpha)

        # An ORDER BY on a GROUP_CONCAT query sorts the grouped result rows,
        # not the phrases inside each group. Sort the raw rows first and
        # concatenate in pandas, which keeps row order within each group.
        sql = "SELECT topic_id, topic_phrase FROM topicphrase ORDER BY phrase_weight DESC"
        phrase_rows = pd.read_sql_query(sql, self.model.conn)
        phrases = (
            phrase_rows.dropna(subset=['topic_phrase'])
            .groupby('topic_id')['topic_phrase']
            .agg(lambda items: ', '.join(str(item) for item in items))
        )

        if by_alpha:
            topic_id_list = topics.topic_alpha.sort_values(ascending=False).index.tolist()
        else:
            # Use the ids that actually exist rather than assuming 0..n-1.
            topic_id_list = sorted(topics.index.tolist())

        cards = []
        for topic_id in topic_id_list:
            row = topics.loc[topic_id]
            cards.append(dict(
                topic_id=topic_id,
                topic_alpha=row['topic_alpha'],
                topic_alpha_zscore=row['topic_alpha_zscore'],
                topic_phrases=phrases.get(topic_id, ''),
                topic_words=row['topic_words'],
            ))
        return cards

    def get_prhases_for_topic(self, topic_id):
        """Return the phrases of one topic as a ``', '``-joined string, highest ``phrase_weight`` first.

        Args:
            topic_id: Topic identifier; converted with ``int()`` and bound as a
                query parameter instead of being formatted into the SQL text.
        """
        sql = "SELECT topic_phrase FROM topicphrase WHERE topic_id = ? ORDER BY phrase_weight DESC"
        # int() also turns numpy integers into a type sqlite3 can bind.
        rows = pd.read_sql_query(sql, self.model.conn, params=(int(topic_id),))
        return ', '.join(rows['topic_phrase'].tolist())

    # Correctly spelled alias; the misspelled name is kept for existing callers.
    get_phrases_for_topic = get_prhases_for_topic

    def display_topic_list(self, by_alpha=True):
        """Render the cards from ``get_topic_list`` as an HTML table string."""
        topic_list = self.get_topic_list(by_alpha)
        return pd.DataFrame(topic_list).to_html()

    """
Пример #14
0
class PoloRetro:
    """Copy a corpus database and a model database into one combined "retro" database.

    ``retro_combine`` opens the three databases and creates the target schema;
    the ``create_*_table`` methods each read from ``self.corpus`` and/or
    ``self.model`` and write one table to ``self.retro``.
    """

    def __init__(self, config):
        """Store the config; the database handles stay ``None`` until ``retro_combine`` runs."""
        self.config = config
        self.corpus = None
        self.model = None
        self.retro = None

    # todo: Rewrite as PoloCombiner or something and make this the init
    def retro_combine(self, corpus_dbfile, model_dbfile, retro_dbfile=None):
        """Open the corpus, model and retro databases and create the retro schema.

        Args:
            corpus_dbfile: Passed to ``PoloDb`` to open the corpus database.
            model_dbfile: Passed to ``PoloDb`` to open the model database.
            retro_dbfile: Target database; defaults to
                ``'<slug>-retro-combo.db'`` using the config's ``slug``.
        """
        self.corpus = PoloDb(corpus_dbfile)
        self.model = PoloDb(model_dbfile)
        if retro_dbfile is None:
            retro_dbfile = '{}-retro-combo.db'.format(
                self.config.ini['DEFAULT']['slug'])
        self.retro = PoloDb(retro_dbfile)
        self.create_retro_db()

    def create_all_tables(self):
        """Populate every retro table by calling each ``create_*_table`` method in turn."""
        self.create_config_table()
        self.create_src_doc_meta_table()
        self.create_src_doc_table()
        self.create_word_table()
        self.create_doc_table()
        self.create_docword_table()
        self.create_topic_table()
        self.create_doctopic_table()
        self.create_doctopic_long_table()
        self.create_topicword_table()
        self.create_topicword_long_table()
        self.create_topicphrase_table()
        self.create_topicpair_table()
        self.create_topicpair_by_deps_table()
        #self.create_doctopic_sig_table()

    def create_doc_table(self):
        """Write the retro ``doc`` table: model doc ids and labels plus corpus content.

        The model ``doc`` table references corpus documents through
        ``src_doc_id``, so content is looked up through that column when it
        exists. Without it, content is aligned directly on ``doc_id``.
        """
        doc = self.model.get_table('doc').set_index('doc_id')
        src_doc = self.corpus.get_table('doc').set_index('doc_id')
        new_doc = pd.DataFrame(index=doc.index)
        new_doc['doc_label'] = doc['doc_label']
        if 'src_doc_id' in doc.columns:
            new_doc['doc_str'] = doc['src_doc_id'].map(src_doc['doc_content'])
        else:
            new_doc['doc_str'] = src_doc['doc_content']
        self.retro.put_table(new_doc, 'doc', if_exists='replace', index=True)

    def create_src_doc_table(self):
        """Write the retro ``src_doc`` table from the corpus ``doc`` table.

        ``doc_uri`` is the config's ``src_base_url`` followed by ``doc_key``.
        ``doc_year`` and ``doc_date`` are copied only when the corpus table has
        them; ``doc_ord`` and ``doc_citation`` are written as NULL.
        """
        src_doc = self.corpus.get_table('doc').set_index('doc_id')
        base_url = self.config.ini['DEFAULT']['src_base_url']
        new_src_doc = pd.DataFrame(
            index=src_doc.index,
            columns=
            'src_meta_id doc_title doc_uri doc_label doc_ord doc_content doc_original doc_year doc_date doc_citation'
            .split())
        # NOTE(review): src_meta_id is never filled in here; presumably it
        # should carry the same slug written to src_doc_meta -- confirm.
        new_src_doc['doc_title'] = src_doc['doc_title']
        new_src_doc['doc_uri'] = src_doc['doc_key'].apply(
            lambda key: base_url + str(key))
        new_src_doc['doc_label'] = src_doc['doc_label']
        new_src_doc['doc_ord'] = None
        new_src_doc['doc_content'] = src_doc['doc_content']
        new_src_doc['doc_original'] = src_doc['doc_original']
        for optional in ('doc_year', 'doc_date'):
            if optional in src_doc.columns:
                new_src_doc[optional] = src_doc[optional]
        new_src_doc['doc_citation'] = None
        self.retro.put_table(new_src_doc,
                             'src_doc',
                             if_exists='replace',
                             index=True)

    def create_src_doc_meta_table(self):
        """Write the single-row ``src_doc_meta`` table from the config's slug, title and base URL."""
        defaults = self.config.ini['DEFAULT']
        src_doc_meta = pd.DataFrame(
            {
                'src_meta_id': defaults['slug'],
                'src_meta_desc': defaults['title'],
                'src_meta_base_url': defaults['src_base_url'],
                'src_meta_ord_type': None
            },
            index=['src_meta_id'
                   ])  # fixme: Need to add ord type to config and pass it
        self.retro.put_table(src_doc_meta, 'src_doc_meta', if_exists='replace')

    def create_word_table(self):
        """Write the retro ``word`` table from the corpus ``token`` table.

        ``word_id`` is the row index of the token frame as returned by
        ``get_table``.
        """
        # NOTE(review): create_docword_table uses the token ROWID as word_id;
        # confirm the two id schemes agree.
        word = self.corpus.get_table('token')
        new_word = pd.DataFrame(
            columns='word_id word_str word_freq word_stem'.split())
        new_word['word_id'] = word.index
        new_word = new_word.set_index('word_id')
        new_word['word_str'] = word['token_str']
        new_word['word_freq'] = word['token_count']
        new_word['word_stem'] = None
        self.retro.put_table(new_word, 'word', if_exists='replace', index=True)

    def create_docword_table(self):
        """Write the retro ``docword`` table by joining corpus ``doctoken`` to ``token``."""
        sql = "SELECT dt.doc_id, t.ROWID as 'word_id', t.token_str as 'word_str', t.token_count as 'word_count', NULL as 'tfidf_weight' " \
              "FROM doctoken dt JOIN token t USING(token_str)"
        new_docword = pd.read_sql_query(sql, self.corpus.conn)
        self.retro.put_table(new_docword, 'docword', if_exists='replace')

    def create_config_table(self):
        """Copy the model ``config`` table to the retro database unchanged."""
        config = self.model.get_table('config')
        self.retro.put_table(config, 'config', if_exists='replace')

    def create_doctopic_table(self):
        """Write the wide ``doctopic`` table: one row per doc, one ``t<topic_id>`` column per topic.

        ``topic_entropy`` and ``doc_label`` are attached from the model ``doc``
        table by matching on ``doc_id``.
        """
        doctopic = self.model.get_table('doctopic')
        doctopic['topic_label'] = doctopic['topic_id'].apply('t{}'.format)
        doctopic_wide = (
            doctopic.set_index(['doc_id', 'topic_label'])['topic_weight']
            .unstack()
            .reset_index()
        )
        doc = self.model.get_table('doc').set_index('doc_id')
        # Join on the doc_id column: after reset_index() the wide frame's own
        # index is positional and would only line up with doc ids by accident.
        doctopic_wide = doctopic_wide.join(doc[['topic_entropy', 'doc_label']],
                                           on='doc_id',
                                           how='left')
        self.retro.put_table(doctopic_wide, 'doctopic', if_exists='replace')

    def create_topic_table(self):
        """Write the retro ``topic`` table; ``total_tokens`` comes from the model's ``topic_tokens``."""
        topic = self.model.get_table('topic')
        new_topic = pd.DataFrame({
            'topic_id': topic['topic_id'],
            'topic_alpha': topic['topic_alpha'],
            'total_tokens': topic['topic_tokens'],
            'topic_words': topic['topic_words'],
        })
        self.retro.put_table(new_topic, 'topic', if_exists='replace')

    def create_topicphrase_table(self):
        """Copy the model ``topicphrase`` table to the retro database unchanged."""
        topicphrase = self.model.get_table('topicphrase')
        self.retro.put_table(topicphrase, 'topicphrase', if_exists='replace')

    def create_topicword_table(self):
        """Write the wide ``topicword`` table: one row per word, one ``t<topic_id>`` count column per topic.

        Missing word/topic combinations are written as 0 and ``word_str`` is
        attached from the model ``word`` table.
        """
        topicword = self.model.get_table('topicword')
        word = self.model.get_table('word').set_index('word_id')
        topicword['word_count'] = topicword['word_count'].astype(int)
        topicword['topic_label'] = topicword['topic_id'].apply('t{}'.format)
        topicword_wide = (
            topicword.set_index(['word_id', 'topic_label'])['word_count']
            .unstack()
            .fillna(0)
        )
        topicword_wide['word_str'] = word['word_str']
        self.retro.put_table(topicword_wide,
                             'topicword',
                             if_exists='replace',
                             index=True)

    def create_doctopic_long_table(self):
        """Copy the model ``doctopic`` table to ``doctopic_long`` unchanged."""
        doctopic = self.model.get_table('doctopic')
        self.retro.put_table(doctopic, 'doctopic_long', if_exists='replace')

    def create_topicword_long_table(self):
        """Write ``topicword_long``: the model ``topicword`` rows joined with the ``word`` table's columns."""
        topicword = self.model.get_table('topicword')
        word = self.model.get_table('word').set_index('word_id')
        topicword['word_count'] = topicword['word_count'].astype(int)
        topicword = topicword.set_index(['word_id', 'topic_id'])
        topicword = topicword.join(word, how='left')
        self.retro.put_table(topicword,
                             'topicword_long',
                             if_exists='replace',
                             index=True)

    def create_topicpair_table(self):
        """Write the retro ``topicpair`` table, renaming the model's pair columns."""
        topicpair = self.model.get_table('topicpair')
        new_tp = pd.DataFrame({
            'topic_id1': topicpair['topic_a_id'],
            'topic_id2': topicpair['topic_b_id'],
            'cosine_sim': topicpair['cosim'],
            'js_div': topicpair['jsd'],
        })
        self.retro.put_table(new_tp, 'topicpair', if_exists='replace')

    def create_topicpair_by_deps_table(self):
        """Write ``topicpair_by_deps``: pair statistics plus each topic's ``topic_rel_freq``.

        ``p_a`` and ``p_b`` are the relative frequencies of topic a and
        topic b, obtained by merging the ``topic`` table in once per side.
        """
        topicpair = self.model.get_table('topicpair')
        topic = self.model.get_table('topic')
        rel_freq = topic[['topic_id', 'topic_rel_freq']]
        # Two merges on differently named keys: pandas suffixes the repeated
        # topic_rel_freq column with _x (topic a) and _y (topic b).
        topicpair = topicpair.merge(rel_freq,
                                    left_on='topic_a_id',
                                    right_on='topic_id',
                                    how='inner')
        topicpair = topicpair.merge(rel_freq,
                                    left_on='topic_b_id',
                                    right_on='topic_id',
                                    how='inner')
        new_tp = pd.DataFrame({
            'topic_a': topicpair['topic_a_id'],
            'topic_b': topicpair['topic_b_id'],
            'p_a': topicpair['topic_rel_freq_x'],
            'p_b': topicpair['topic_rel_freq_y'],
            'p_ab': topicpair['p_ab'],
            'p_aGb': topicpair['p_aGb'],
            'p_bGa': topicpair['p_bGa'],
            'i_ab': topicpair['i_ab'],
        })
        # Pass if_exists explicitly, like every other writer in this class.
        self.retro.put_table(new_tp, 'topicpair_by_deps', if_exists='replace')

    def create_doctopic_sig_table(self):
        """Placeholder; not implemented."""
        pass

    def create_retro_db(self):
        """Create the retro schema with ``CREATE TABLE IF NOT EXISTS`` statements.

        The wide ``doctopic`` and ``topicword`` tables get one ``t<topic_id>``
        column per row of the model ``topic`` table, matching the labels used
        by ``create_doctopic_table`` and ``create_topicword_table``.
        """
        sql_creators = """
        CREATE TABLE IF NOT EXISTS src_doc_meta (src_meta_id TEXT,src_meta_desc TEXT,src_meta_base_url TEXT,src_meta_ord_type TEXT);
        CREATE TABLE IF NOT EXISTS src_doc (src_meta_id TEXT,doc_id INTEGER PRIMARY KEY,doc_title TEXT,doc_uri TEXT UNIQUE,doc_label TEXT,doc_ord INTEGER,doc_content TEXT,doc_original TEXT,doc_year INTEGER,doc_date TEXT,doc_citation TEXT);
        CREATE TABLE IF NOT EXISTS word (word_id INTEGER PRIMARY KEY,word_str TEXT,word_freq INTEGER,word_stem TEXT);
        CREATE TABLE IF NOT EXISTS doc (doc_id INTEGER PRIMARY KEY,doc_label TEXT,doc_str TEXT);
        CREATE TABLE IF NOT EXISTS docword (doc_id INTEGER,word_id INTEGER,word_str TEXT,word_count INTEGER,tfidf_weight REAL);
        CREATE TABLE IF NOT EXISTS config (key TEXT, value TEXT);
        CREATE TABLE IF NOT EXISTS topic (topic_id INTEGER PRIMARY KEY, topic_alpha REAL, total_tokens INTEGER, topic_words TEXT);
        CREATE TABLE IF NOT EXISTS topicphrase (topic_id INTEGER, topic_phrase TEXT, phrase_count INTEGER, phrase_weight REAL);
        CREATE TABLE IF NOT EXISTS doctopic_long (doc_id INTEGER NOT NULL, topic_id INTEGER NOT NULL, topic_weight REAL NOT NULL, UNIQUE (doc_id, topic_id));
        CREATE TABLE IF NOT EXISTS topicword_long (word_id INTEGER NOT NULL, word_str TEXT NOT NULL, topic_id INTEGER NOT NULL, word_count INTEGER NOT NULL, UNIQUE (word_id, topic_id));
        CREATE TABLE IF NOT EXISTS topicpair (topic_id1 INTEGER, topic_id2 INTEGER, cosine_sim REAL, js_div REAL);
        CREATE TABLE IF NOT EXISTS topicpair_by_deps (topic_a INTEGER, topic_b INTEGER, p_a REAL, p_b REAL, p_ab REAL, p_aGb REAL, p_bGa REAL, i_ab REAL);
        CREATE TABLE IF NOT EXISTS doctopic_sig (doc_id INTEGER PRIMARY KEY, topic_sig TEXT, topic_sig_sorted TEXT, topic_n INTEGER);
        """.split(';')
        # Splitting on ';' leaves a whitespace-only tail; drop blank statements.
        statements = [stmt.strip() for stmt in sql_creators if stmt.strip()]

        # Handle wide tables: name columns after the actual topic ids.
        topic = self.model.get_table('topic')
        topic_ids = sorted(topic['topic_id'].tolist())
        topic_fields_real = ','.join(
            ['t{} REAL'.format(tid) for tid in topic_ids])
        topic_fields_int = ','.join(
            ['t{} INTEGER'.format(tid) for tid in topic_ids])
        statements.append(
            "CREATE TABLE IF NOT EXISTS doctopic (doc_id INTEGER PRIMARY KEY, doc_label TEXT, topic_entropy REAL, {})"
            .format(topic_fields_real))
        statements.append(
            "CREATE TABLE IF NOT EXISTS topicword (word_id INTEGER, word_str TEXT, {})"
            .format(topic_fields_int))

        for sql_create in statements:
            self.retro.conn.execute(sql_create)
Пример #15
0
 def __init__(self, conifg, corpus_dbfile=None):
     """Initialise empty ``gs_corpus``/``gs_dict`` slots and open the corpus database.

     Args:
         conifg: Not used in the lines visible here.
             NOTE(review): the name looks like a typo for ``config``; renaming
             it would break keyword callers, so confirm before changing.
         corpus_dbfile: Value passed straight to ``PoloDb(...)``; defaults to
             ``None``.
     """
     # Placeholders; nothing in the visible lines assigns them a real value.
     self.gs_corpus = None
     self.gs_dict = None
     self.db = PoloDb(corpus_dbfile)