예제 #1
0
class PoloGensim:
    def __init__(self, conifg, corpus_dbfile=None):
        """Open the corpus database and start with no corpus or dictionary.

        Args:
            conifg: Accepted but not used by this constructor. The spelling
                is kept as-is so existing keyword callers keep working.
            corpus_dbfile: Passed straight through to ``PoloDb``.
        """
        # Both are filled in later by make_gs_corpus() / make_gs_dict().
        self.gs_corpus = self.gs_dict = None
        self.db = PoloDb(corpus_dbfile)

    def make_gs_corpus(self):
        doctokenbow = self.db.get_table('doctokenbow')
        doctokenbow.set_index('doc_id', inplace=True)
        self.gs_corpus = [[
            (row[0], row[1])
            for row in doctokenbow.loc[doc_id == doc_id,
                                       ['token_id', 'token_count']].values
        ] for doc_id in doctokenbow.index.unique()]
        """
        # Old school loop way
        self.gs_corpus = []
        for doc_id in doctokenbow.index.unique():
            doc = []
            for row in doctokenbow.loc[doc_id, ['token_id', 'token_count']].values:
                doc.append((row[0], row[1]))
            self.gs_corpus.append(doc)
        """

    def make_gs_dict(self):
        token = self.db.get_table('token')
        self.gs_dict = {
            row[0]: row[1]
            for row in token[['token_id', 'token_str']].values
        }

    def get_hdp(self):
        """Fit an HDP model and store its topic/token weights in the database.

        Builds ``models.HdpModel`` from ``self.gs_corpus`` and ``self.gs_dict``,
        writes the reshaped output of ``get_topics()`` to the ``hdp`` table,
        then writes one row per topic (its above-threshold words joined by
        spaces) to the ``hdp_topics`` table.

        ``self.gs_corpus`` and ``self.gs_dict`` are ``None`` until
        ``make_gs_corpus`` and ``make_gs_dict`` have been called.
        """
        hdp = models.HdpModel(self.gs_corpus, self.gs_dict)
        hdp_topics = hdp.get_topics()
        # NOTE(review): presumably a (topics x tokens) matrix, so that after
        # unstack() the outer index level is the token and the inner level is
        # the topic -- the column names assigned below rely on that; confirm.
        hdp_df = pd.DataFrame(hdp_topics)
        hdp_dfn = pd.DataFrame(hdp_df.unstack())
        hdp_dfn.reset_index(inplace=True)
        hdp_dfn.columns = ['token_id', 'topic_id', 'token_freq']
        self.db.put_table(hdp_dfn, 'hdp', if_exists='replace')

        # todo: Go the next step and extract topic with word with freqs above a thresh
        thresh = 0.0005
        # Sometimes it's easier to use SQL than to figure out how to do
        # something like this in Pandas. The query reads the `hdp` table that
        # was just written and joins it to `token` to get the word strings.
        sql = """
        SELECT topic_id, GROUP_CONCAT(token_str, ' ') AS top_words
        FROM ( SELECT topic_id, token_id FROM hdp WHERE token_freq > {} ORDER BY topic_id, token_freq DESC )
        JOIN token USING (token_id)
        GROUP BY topic_id
        """.format(thresh)
        hdp_topics = pd.read_sql_query(sql, self.db.conn)
        # NOTE(review): unlike the `hdp` write above, no if_exists is passed
        # here, so the behavior on re-run depends on put_table's default.
        self.db.put_table(hdp_topics, 'hdp_topics')

        # Reassigned here but not read again in the visible part of the method.
        thresh = 0.005  # Note this is different from what's in config.ini
예제 #2
0
class PoloRetro:
    def __init__(self, config):
        """Keep the configuration object; no database is opened yet.

        Args:
            config: Configuration object. Other methods read settings from
                ``config.ini['DEFAULT']``.
        """
        self.config = config
        # Database handles stay unset until retro_combine() assigns them.
        self.corpus = self.model = self.retro = None

    # todo: Rewrite as PoloCombiner or something and make this the init
    def retro_combine(self, corpus_dbfile, model_dbfile, retro_dbfile=None):
        """Open the corpus, model and retro databases and create the schema.

        Args:
            corpus_dbfile: Path handed to ``PoloDb`` for the corpus database.
            model_dbfile: Path handed to ``PoloDb`` for the model database.
            retro_dbfile: Path for the combined output database. When omitted,
                ``'<slug>-retro-combo.db'`` is used, where ``<slug>`` is read
                from ``config.ini['DEFAULT']['slug']``.

        Only ``create_retro_db()`` is invoked here; the tables are not
        populated by this method.
        """
        self.corpus = PoloDb(corpus_dbfile)
        self.model = PoloDb(model_dbfile)
        # Identity test: ``== None`` can be hijacked by objects that
        # overload ``__eq__``.
        if retro_dbfile is None:
            retro_dbfile = '{}-retro-combo.db'.format(
                self.config.ini['DEFAULT']['slug'])
        self.retro = PoloDb(retro_dbfile)
        self.create_retro_db()

    def create_all_tables(self):
        self.create_config_table()
        self.create_src_doc_meta_table()
        self.create_src_doc_table()
        self.create_word_table()
        self.create_doc_table()
        self.create_docword_table()
        self.create_topic_table()
        self.create_doctopic_table()
        self.create_doctopic_long_table()
        self.create_topicword_table()
        self.create_topicword_long_table()
        self.create_topicphrase_table()
        self.create_topicpair_table()
        self.create_topicpair_by_deps_table()
        #self.create_doctopic_sig_table()

    def create_doc_table(self):
        doc = self.model.get_table('doc')
        src_doc = self.corpus.get_table('doc')
        new_doc = pd.DataFrame(columns=['doc_id', 'doc_label', 'doc_str'])
        new_doc['doc_id'] = doc['doc_id']
        doc.set_index('doc_id', inplace=True)
        src_doc.set_index('doc_id', inplace=True)
        new_doc.set_index('doc_id', inplace=True)
        new_doc['doc_label'] = doc.doc_label
        new_doc['doc_str'] = src_doc.doc_content
        self.retro.put_table(new_doc, 'doc', if_exists='replace', index=True)

    def create_src_doc_table(self):
        src_doc = self.corpus.get_table('doc')
        src_doc.set_index('doc_id', inplace=True)
        new_src_doc = pd.DataFrame(
            columns=
            'src_meta_id doc_id doc_title doc_uri doc_label doc_ord doc_content doc_original doc_year doc_date doc_citation'
            .split())
        new_src_doc['doc_id'] = src_doc.index
        new_src_doc.set_index('doc_id', inplace=True)
        new_src_doc['doc_title'] = src_doc.doc_title
        new_src_doc['doc_uri'] = src_doc.doc_key
        new_src_doc['doc_uri'] = new_src_doc['doc_uri'].apply(
            lambda x: self.config.ini['DEFAULT']['src_base_url'] + str(x))
        new_src_doc['doc_label'] = src_doc.doc_label
        new_src_doc['doc_ord'] = None
        new_src_doc['doc_content'] = src_doc.doc_content
        new_src_doc['doc_original'] = src_doc.doc_original
        if 'doc_year' in src_doc.columns:
            new_src_doc['doc_year'] = src_doc.doc_year
        if 'doc_date' in src_doc.columns:
            new_src_doc['doc_date'] = src_doc.doc_date
        new_src_doc['doc_citation'] = None
        self.retro.put_table(new_src_doc,
                             'src_doc',
                             if_exists='replace',
                             index=True)

    def create_src_doc_meta_table(self):
        src_doc_meta = pd.DataFrame(
            {
                'src_meta_id': self.config.ini['DEFAULT']['slug'],
                'src_meta_desc': self.config.ini['DEFAULT']['title'],
                'src_meta_base_url':
                self.config.ini['DEFAULT']['src_base_url'],
                'src_meta_ord_type': None
            },
            index=['src_meta_id'
                   ])  # fixme: Need to add ord type to config and pass it
        self.retro.put_table(src_doc_meta, 'src_doc_meta', if_exists='replace')

    def create_word_table(self):
        word = self.corpus.get_table('token')
        new_word = pd.DataFrame(
            columns='word_id word_str word_freq word_stem'.split())
        new_word['word_id'] = word.index
        new_word.set_index('word_id', inplace=True)
        new_word['word_str'] = word.token_str
        new_word['word_freq'] = word.token_count
        new_word['word_stem'] = None
        self.retro.put_table(new_word, 'word', if_exists='replace', index=True)

    def create_docword_table(self):
        sql = "SELECT dt.doc_id, t.ROWID as 'word_id', t.token_str as 'word_str', t.token_count as 'word_count', NULL as 'tfidf_weight' " \
              "FROM doctoken dt JOIN token t USING(token_str)"
        new_docword = pd.read_sql_query(sql, self.corpus.conn)
        self.retro.put_table(new_docword, 'docword', if_exists='replace')

    def create_config_table(self):
        config = self.model.get_table('config')
        self.retro.put_table(config, 'config', if_exists='replace')

    def create_doctopic_table(self):
        doctopic = self.model.get_table('doctopic')
        doctopic['topic_label'] = doctopic['topic_id'].apply(
            lambda x: 't{}'.format(x))
        doctopic = doctopic[['doc_id', 'topic_label', 'topic_weight']]
        doctopic.set_index(['doc_id', 'topic_label'], inplace=True)
        doctopic_wide = doctopic.unstack().reset_index()
        doctopic_wide.columns = doctopic_wide.columns.droplevel(0)
        doctopic_wide.rename(columns={'': 'doc_id'}, inplace=True)
        doc = self.model.get_table('doc')
        doc.set_index('doc_id', inplace=True)
        doctopic_wide = doctopic_wide.join(doc[['topic_entropy', 'doc_label']],
                                           how='left')
        self.retro.put_table(doctopic_wide, 'doctopic', if_exists='replace')

    def create_topic_table(self):
        topic = self.model.get_table('topic')
        new_topic = pd.DataFrame(
            columns='topic_id topic_alpha total_tokens topic_words'.split())
        new_topic['topic_id'] = topic.topic_id
        new_topic['topic_alpha'] = topic.topic_alpha
        new_topic['topic_words'] = topic.topic_words
        new_topic['total_tokens'] = topic.topic_tokens
        self.retro.put_table(new_topic, 'topic', if_exists='replace')

    def create_topicphrase_table(self):
        topicphrase = self.model.get_table('topicphrase')
        self.retro.put_table(topicphrase, 'topicphrase', if_exists='replace')

    def create_topicword_table(self):
        topicword = self.model.get_table('topicword')
        word = self.model.get_table('word')
        topicword['word_count'] = topicword['word_count'].astype(int)
        topicword['topic_label'] = topicword['topic_id'].apply(
            lambda x: 't{}'.format(x))
        topicword = topicword[['word_id', 'topic_label', 'word_count']]
        topicword.set_index(['word_id', 'topic_label'], inplace=True)
        topicword_wide = topicword.unstack().reset_index()
        topicword_wide.columns = topicword_wide.columns.droplevel(0)
        topicword_wide.rename(columns={'': 'word_id'}, inplace=True)
        topicword_wide.fillna(0, inplace=True)
        topicword_wide.set_index('word_id', inplace=True)
        word.set_index('word_id', inplace=True)
        topicword_wide['word_str'] = word.word_str
        self.retro.put_table(topicword_wide,
                             'topicword',
                             if_exists='replace',
                             index=True)

    def create_doctopic_long_table(self):
        doctopic = self.model.get_table('doctopic')
        self.retro.put_table(doctopic, 'doctopic_long', if_exists='replace')

    def create_topicword_long_table(self):
        topicword = self.model.get_table('topicword')
        word = self.model.get_table('word')
        topicword['word_count'] = topicword['word_count'].astype(int)
        word.set_index('word_id', inplace=True)
        topicword.set_index(['word_id', 'topic_id'], inplace=True)
        topicword = topicword.join(word, how='left')
        self.retro.put_table(topicword,
                             'topicword_long',
                             if_exists='replace',
                             index=True)

    def create_topicpair_table(self):
        topicpair = self.model.get_table('topicpair')
        new_tp = pd.DataFrame(
            columns='topic_id1 topic_id2 cosine_sim js_div'.split())
        new_tp['topic_id1'] = topicpair.topic_a_id
        new_tp['topic_id2'] = topicpair.topic_b_id
        new_tp['cosine_sim'] = topicpair.cosim
        new_tp['js_div'] = topicpair.jsd
        self.retro.put_table(new_tp, 'topicpair', if_exists='replace')

    def create_topicpair_by_deps_table(self):
        topicpair = self.model.get_table('topicpair')
        topic = self.model.get_table('topic')
        topicpair = topicpair.merge(topic[['topic_id', 'topic_rel_freq']],
                                    left_on='topic_a_id',
                                    right_on='topic_id',
                                    how='inner')
        topicpair = topicpair.merge(topic[['topic_id', 'topic_rel_freq']],
                                    left_on='topic_b_id',
                                    right_on='topic_id',
                                    how='inner')
        new_tp = pd.DataFrame(
            columns='topic_a topic_b p_a p_b p_ab p_aGb p_bGa i_ab'.split())
        new_tp['topic_a'] = topicpair.topic_a_id
        new_tp['topic_b'] = topicpair.topic_b_id
        new_tp['p_a'] = topicpair.topic_rel_freq_x
        new_tp['p_b'] = topicpair.topic_rel_freq_y
        new_tp['p_ab'] = topicpair.p_ab
        new_tp['p_aGb'] = topicpair.p_aGb
        new_tp['p_bGa'] = topicpair.p_bGa
        new_tp['i_ab'] = topicpair.i_ab
        self.retro.put_table(new_tp, 'topicpair_by_deps')

    def create_doctopic_sig_table(self):
        """Placeholder: does nothing and returns ``None``."""
        return None

    # fixme: The sql for tables with topics for columns need to be generated!
    def create_retro_db(self):
        sql_creators = """
        CREATE TABLE IF NOT EXISTS src_doc_meta (src_meta_id TEXT,src_meta_desc TEXT,src_meta_base_url TEXT,src_meta_ord_type TEXT);
        CREATE TABLE IF NOT EXISTS src_doc (src_meta_id TEXT,doc_id INTEGER PRIMARY KEY,doc_title TEXT,doc_uri TEXT UNIQUE,doc_label TEXT,doc_ord INTEGER,doc_content TEXT,doc_original TEXT,doc_year INTEGER,doc_date TEXT,doc_citation TEXT);
        CREATE TABLE IF NOT EXISTS word (word_id INTEGER PRIMARY KEY,word_str TEXT,word_freq INTEGER,word_stem TEXT);
        CREATE TABLE IF NOT EXISTS doc (doc_id INTEGER PRIMARY KEY,doc_label TEXT,doc_str TEXT);
        CREATE TABLE IF NOT EXISTS docword (doc_id INTEGER,word_id INTEGER,word_str TEXT,word_count INTEGER,tfidf_weight REAL);
        CREATE TABLE IF NOT EXISTS config (key TEXT, value TEXT);
        CREATE TABLE IF NOT EXISTS topic (topic_id INTEGER PRIMARY KEY, topic_alpha REAL, total_tokens INTEGER, topic_words TEXT);
        CREATE TABLE IF NOT EXISTS topicphrase (topic_id INTEGER, topic_phrase TEXT, phrase_count INTEGER, phrase_weight REAL);
        CREATE TABLE IF NOT EXISTS doctopic_long (doc_id INTEGER NOT NULL, topic_id INTEGER NOT NULL, topic_weight REAL NOT NULL, UNIQUE (doc_id, topic_id));
        CREATE TABLE IF NOT EXISTS topicword_long (word_id INTEGER NOT NULL, word_str TEXT NOT NULL, topic_id INTEGER NOT NULL, word_count INTEGER NOT NULL, UNIQUE (word_id, topic_id));
        CREATE TABLE IF NOT EXISTS topicpair (topic_id1 INTEGER, topic_id2 INTEGER, cosine_sim REAL, js_div REAL);
        CREATE TABLE IF NOT EXISTS topicpair_by_deps (topic_a INTEGER, topic_b INTEGER, p_a REAL, p_b REAL, p_ab REAL, p_aGb REAL, p_bGa REAL, i_ab REAL);
        CREATE TABLE IF NOT EXISTS doctopic_sig (doc_id INTEGER PRIMARY KEY, topic_sig TEXT, topic_sig_sorted TEXT, topic_n INTEGER);
        """.split(';')

        # Handle wide tables
        topic = self.model.get_table('topic')
        n_topics = len(topic.topic_id.tolist())
        topic_fields_real = ','.join(
            ['t{} REAL'.format(tn) for tn in range(n_topics)])
        topic_fields_int = ','.join(
            ['t{} INTEGER'.format(tn) for tn in range(n_topics)])
        sql_creators.append(
            "CREATE TABLE IF NOT EXISTS doctopic (doc_id INTEGER PRIMARY KEY, doc_label TEXT, topic_entropy REAL, {})"
            .format(topic_fields_real))
        sql_creators.append(
            "CREATE TABLE IF NOT EXISTS topicword (word_id INTEGER, word_str TEXT, {})"
            .format(topic_fields_int))

        for sql_create in sql_creators:
            self.retro.conn.execute(sql_create)