Example #1
def themes(eids, algo='agglomorative'):
    logger.info("Themes")
    with session() as sess:
        # use Model to decrypt fields
        E, CE = M.Entry, M.CacheEntry
        res = sess.query(CE) \
            .with_entities(CE.paras, CE.clean, CE.vectors) \
            .join(E, E.id == CE.entry_id) \
            .filter(E.id.in_(eids), func.array_length(CE.vectors,1)>0) \
            .order_by(E.created_at.desc()) \
            .all()
    # assert len(eids) == len(res)
    entries = pd.Series([e for r in res for e in r.paras])
    stripped = pd.Series([c for r in res for c in r.clean])
    vecs = np.vstack([r.vectors for r in res]).astype(np.float32)

    chain = Similars(vecs)
    if False and os.path.exists(vars.AE_PATH):  # autoencoder path disabled via `False and`
        chain = chain.autoencode(filename=vars.AE_PATH).cluster(algo=algo)
    else:
        chain = chain.normalize().cluster(algo=algo)
    clusters = chain.value()
    labels = chain.data.labels

    topics = []
    for l, center in enumerate(clusters):
        mask = labels == l
        n_entries = mask.sum().item()
        print('n_entries', n_entries)
        if n_entries < 2:
            print('skipping')
            continue

        vecs_, stripped_, entries_ = vecs[mask], stripped[mask], entries[mask]

        dists = Similars(center, vecs_).normalize().cosine(abs=True).value().squeeze()
        entries_ = entries_.iloc[dists.argsort()].tolist()[:5]

        terms = top_terms(stripped_.tolist())
        topics.append({
            'n_entries': n_entries,
            'terms': terms,
            'summary': entries_,  # add full thing, will batch-compute next
            'sentiment': None,
        })

    groups = [t['summary'] for t in topics]
    batch_summaries = nlp_.summarization(groups, min_length=50, max_length=300)
    for i, res in enumerate(batch_summaries):
        print(res)
        topics[i]['summary'] = res['summary']
        topics[i]['sentiment'] = res['sentiment']

    topics = {'terms': top_terms(stripped, 10), 'themes': topics}

    return topics
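
For reference, a minimal usage sketch of the function above (the entry objects and ids here are hypothetical; session, models, and NLP helpers are assumed to be configured elsewhere):

# eids: ids of the entries to cluster into themes
result = themes(eids=[e.id for e in my_entries], algo='agglomorative')
result['terms']              # top 10 terms across all cleaned paragraphs
for theme in result['themes']:
    theme['n_entries']       # how many paragraph vectors fell in this cluster
    theme['terms']           # top terms for the cluster
    theme['summary']         # batch-computed summary text
    theme['sentiment']       # sentiment returned by the summarization step
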
Example #2
File: themes.py  Project: tullur/gnothi
def themes(eids, algo='agglomorative'):
    with session() as sess:
        # use Model to decrypt fields
        res = sess.query(M.CacheEntry)\
            .with_entities(M.CacheEntry.paras, M.CacheEntry.clean, M.CacheEntry.vectors)\
            .join(M.Entry, M.Entry.id == M.CacheEntry.entry_id)\
            .filter(M.Entry.id.in_(eids))\
            .order_by(M.Entry.created_at.desc())\
            .all()
    # assert len(eids) == len(res)
    entries = pd.Series([e for r in res for e in r.paras])
    stripped = pd.Series([c for r in res for c in r.clean])
    vecs = []
    for r in res:
        if r.vectors: vecs += r.vectors
    # if not vecs: return False  # TODO something else to return?
    vecs = np.vstack(vecs).astype(np.float32)

    clusters = Similars(vecs).normalize().cluster(algo=algo).value()

    topics = []
    for l in range(clusters.max() + 1):  # cluster labels run 0..max inclusive
        in_clust = clusters == l
        n_entries = in_clust.sum().item()
        print('n_entries', n_entries)
        if n_entries < 2:
            print('skipping')
            continue

        vecs_, stripped_, entries_ = vecs[in_clust],\
            stripped.iloc[in_clust], entries.iloc[in_clust]

        center = vecs_.mean(axis=0)[np.newaxis, :]
        dists = Similars(center, vecs_).normalize().cosine().value().squeeze()
        entries_ = entries_.iloc[dists.argsort()].tolist()[:5]

        terms = top_terms(stripped_.tolist())
        topics.append({
            'n_entries': n_entries,
            'terms': terms,
            'summary': entries_,  # add full thing, will batch-compute next
            'sentiment': None,
        })

    groups = [t['summary'] for t in topics]
    batch_summaries = nlp_.summarization(groups, min_length=50, max_length=300)
    for i, res in enumerate(batch_summaries):
        print(res)
        topics[i]['summary'] = res['summary']
        topics[i]['sentiment'] = res['sentiment']

    topics = {'terms': top_terms(stripped, 10), 'themes': topics}

    return topics
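
The "closest to the cluster center" ranking above can be written with plain numpy; a standalone sketch of the same idea (not the project's Similars chain):

import numpy as np

def closest_to_center(vecs, k=5):
    # L2-normalize rows so dot products become cosine similarities
    normed = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)
    center = normed.mean(axis=0)
    center = center / np.linalg.norm(center)
    dists = 1.0 - normed @ center        # cosine distance of each row to the centroid
    return np.argsort(dists)[:k]         # indices of the k most central rows
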
Example #3
def nlp_on_rows_(method, id, job_id, sess, uids):
    """
    Generate cache_{method} for all missing rows for this user. Eg, for entries we generate
    BERT,paras,clean for the entry specified by id; but also fill in any missing entries in the process.
    Then it checks if any users have missing rows (recursive call) and runs for them too.
    """

    # keep job alive, in case we're recursing and it's taking long
    if job_id is not None:
        sess.execute(text(f"""
        update jobs set updated_at={utcnow} where id=:job_id
        """), dict(job_id=str(job_id)))
        sess.commit()

    for_entries = method == 'entries'  # else is_profile
    m = M.Entry if for_entries else M.User
    # TODO id ignored for now, just grab everything that needs updating
    if for_entries:
        # main_entry = sess.query(m).get(id)
        rows_ = sess.query(m).filter(
            m.text.isnot(None),
            m.no_ai.isnot(True),
            m.ai_ran.isnot(True),
            # m.user_id == main_entry.user_id
        ).limit(10)
    else:
        rows_ = sess.query(m).filter(
            m.bio.isnot(None),
            m.ai_ran.isnot(True)
        ).limit(10)
    rows_ = rows_.all()
    # finished recursing
    if not rows_: return uids

    rows = []
    paras_grouped = []
    for r in rows_:
        txt = r.text if for_entries \
            else r.bio  # r.profile_to_text()  # TODO profile_to_text adds people
        paras = CleanText([txt]).markdown_split_paragraphs().value()
        if not paras:
            # Set everything with not-enough-content to ai_ran, and skip
            if for_entries:
                r.title_summary = r.text_summary = r.sentiment = None
            r.ai_ran = True
            sess.commit()
            continue
        uids.add(r.user_id if for_entries else r.id)
        rows.append(r)
        paras_grouped.append(paras)

    # Everything was too short on content; nothing to do now.
    if not rows: return uids

    paras_flat = [p for paras in paras_grouped for p in paras]

    fkeys = [r.title for r in rows] \
        if for_entries else [r.email for r in rows]
    fixt = fixtures.load_nlp_rows(fkeys, method=method)
    if fixt:
        if for_entries:
            embeds, titles, texts = fixt
        else:
            embeds = fixt
    else:
        # embeds = Similars(paras_flat).embed().autoencode(save_load_path=vars.AE_PATH).value()
        embeds = nlp_.sentence_encode(paras_flat).tolist()
        if for_entries:
            titles = nlp_.summarization(paras_grouped, min_length=5, max_length=20, with_sentiment=False)
            texts = nlp_.summarization(paras_grouped, min_length=30, max_length=250)

    upserts = []
    for i, r in enumerate(rows):
        # Save the cache_entry (paras,clean,vectors)
        paras = paras_grouped[i]
        ct = len(paras)
        id_key = {'entries': 'entry_id', 'profiles': 'user_id'}[method]
        upserts.append({
            id_key: r.id,
            'paras': paras,
            'vectors': embeds[:ct],
        })

        # Save the fixture for later
        fixt = (embeds[:ct], titles[i], texts[i]) \
            if for_entries else (embeds[:ct],)
        fixt_k = r.title if for_entries else r.email
        fixtures.save_nlp_row(fixt_k, fixt, method=method)

        embeds = embeds[ct:]

        if for_entries:
            r.title_summary = titles[i]["summary"]
            r.text_summary = texts[i]["summary"]
            r.sentiment = texts[i]["sentiment"]
        r.ai_ran = True
        # sess.commit()  # deferring till later, so queue up writes for perf

    m = M.CacheEntry if for_entries else M.CacheUser
    insert = postgresql.insert(m.__table__).values(upserts)
    sess.execute(insert.on_conflict_do_update(
        constraint=m.__table__.primary_key,
        set_=dict(paras=insert.excluded.paras, vectors=insert.excluded.vectors)
    ))
    sess.commit()

    # recurse to handle any other users with missing cache_{method}. Real return (empty object)
    # handled at top.
    return nlp_on_rows_(method, None, job_id, sess, uids)
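
The embeds[:ct] / embeds = embeds[ct:] bookkeeping above is a flatten-then-slice pattern: embed every paragraph in one batched call, then hand each row back its own slice. A minimal sketch under that assumption (encode stands in for nlp_.sentence_encode):

def embed_per_row(paras_grouped, encode):
    paras_flat = [p for paras in paras_grouped for p in paras]
    embeds = encode(paras_flat)           # one batched call for all rows' paragraphs
    per_row, i = [], 0
    for paras in paras_grouped:
        ct = len(paras)
        per_row.append(embeds[i:i + ct])  # this row's vectors, in original order
        i += ct
    return per_row
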
Example #4
def nlp_on_rows(method='entries'):
    for_entries = method == 'entries'  # else is_profile
    with session() as sess:
        if for_entries:
            rows = sess.query(M.Entry) \
                .filter(
                    func.length(M.Entry.text) > 64,
                    M.Entry.no_ai.isnot(True),
                    M.Entry.ai_ran.isnot(True)
                )
        else:
            rows = sess.query(M.User) \
                .filter(
                    func.length(M.User.bio) > 32,
                    M.User.ai_ran.isnot(True)
                )
        rows = rows.all()
        if not rows: return {}

        paras_grouped = []
        uids = set()
        for r in rows:
            txt = r.text if for_entries \
                else r.bio  # r.profile_to_text()  # TODO profile_to_text adds people
            paras_grouped.append(cleantext.markdown_split_paragraphs([txt]))
            if for_entries:
                uids.add(r.user_id)
        paras_flat = [p for paras in paras_grouped for p in paras]

        fkeys = [r.title for r in rows] \
            if for_entries else [r.email for r in rows]
        fixt = fixtures.load_nlp_rows(fkeys, method=method)
        if fixt:
            if for_entries:
                clean_txt, embeds, titles, texts = fixt
            else:
                clean_txt, embeds = fixt
        else:
            clean_txt = cleantext.keywords(
                paras_flat, postags=['NOUN', 'ADJ', 'VERB', 'PROPN'])
            embeds = nlp_.sentence_encode(paras_flat).tolist()
            if for_entries:
                titles = nlp_.summarization(paras_grouped,
                                            min_length=5,
                                            max_length=20,
                                            with_sentiment=False)
                texts = nlp_.summarization(paras_grouped,
                                           min_length=30,
                                           max_length=250)

        for i, r in enumerate(rows):
            CM = M.CacheEntry if for_entries else M.CacheUser
            c = sess.query(CM).get(r.id)
            if not c:
                c = CM(entry_id=r.id) if for_entries else CM(user_id=r.id)
                sess.add(c)
            # Save the cache_entry (paras,clean,vectors)
            paras = paras_grouped[i]
            c.paras = paras
            ct = len(paras)
            c.clean = [' '.join(e) for e in clean_txt[:ct]]
            c.vectors = embeds[:ct]
            sess.commit()

            # Save the fixture for later
            fixt = (clean_txt[:ct], embeds[:ct], titles[i], texts[i]) \
                if for_entries else (clean_txt[:ct], embeds[:ct])
            fixt_k = r.title if for_entries else r.email
            fixtures.save_nlp_row(fixt_k, fixt, method=method)

            clean_txt, embeds = clean_txt[ct:], embeds[ct:]

            if for_entries:
                r.title_summary = titles[i]["summary"]
                r.text_summary = texts[i]["summary"]
                r.sentiment = texts[i]["sentiment"]
            r.ai_ran = True
            sess.commit()

        if for_entries:
            # 9131155e: only update every x entries
            M.Job.multiple_book_jobs(list(uids))
    return {}
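
For clarity on the shapes assumed above: cleantext.keywords is expected to return one keyword-token list per paragraph in paras_flat, which the loop re-joins into a space-separated string per paragraph before storing it on the cache row. The values below are purely illustrative:

paras_flat = ["I walked the dog today.", "Work was stressful."]
clean_txt = [["walk", "dog", "today"], ["work", "stressful"]]   # hypothetical keyword output
clean_strings = [' '.join(tokens) for tokens in clean_txt]      # -> ["walk dog today", "work stressful"]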