def themes(eids, algo='agglomorative'):
    logger.info("Themes")
    with session() as sess:
        # use Model to decrypt fields
        E, CE = M.Entry, M.CacheEntry
        res = sess.query(CE) \
            .with_entities(CE.paras, CE.clean, CE.vectors) \
            .join(E, E.id == CE.entry_id) \
            .filter(E.id.in_(eids), func.array_length(CE.vectors, 1) > 0) \
            .order_by(E.created_at.desc()) \
            .all()
    # assert len(eids) == len(res)
    entries = pd.Series([e for r in res for e in r.paras])
    stripped = pd.Series([c for r in res for c in r.clean])
    vecs = np.vstack([r.vectors for r in res]).astype(np.float32)

    chain = Similars(vecs)
    if False and os.path.exists(vars.AE_PATH):
        chain = chain.autoencode(filename=vars.AE_PATH).cluster(algo=algo)
    else:
        chain = chain.normalize().cluster(algo=algo)
    clusters = chain.value()
    labels = chain.data.labels

    topics = []
    for l, center in enumerate(clusters):
        mask = labels == l
        n_entries = mask.sum().item()
        print('n_entries', n_entries)
        if n_entries < 2:
            print('skipping')
            continue
        vecs_, stripped_, entries_ = vecs[mask], stripped[mask], entries[mask]
        dists = Similars(center, vecs_).normalize().cosine(abs=True).value().squeeze()
        entries_ = entries_.iloc[dists.argsort()].tolist()[:5]
        terms = top_terms(stripped_.tolist())
        topics.append({
            'n_entries': n_entries,
            'terms': terms,
            'summary': entries_,  # add full thing, will batch-compute next
            'sentiment': None,
        })

    groups = [t['summary'] for t in topics]
    batch_summaries = nlp_.summarization(groups, min_length=50, max_length=300)
    for i, res in enumerate(batch_summaries):
        print(res)
        topics[i]['summary'] = res['summary']
        topics[i]['sentiment'] = res['sentiment']

    topics = {'terms': top_terms(stripped, 10), 'themes': topics}
    return topics
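
# --- Hedged sketch (illustration only, not part of the app) -----------------------------
# The core of themes() in plain numpy/sklearn, without the Similars chain from ml-tools:
# normalize paragraph vectors, cluster them, then keep the paragraphs closest to each
# cluster centroid as that theme's summary candidates. All names here (sketch_themes,
# n_clusters, top_k) are illustrative assumptions, not this project's API.
def sketch_themes(vecs, paragraphs, n_clusters=5, top_k=5):
    import numpy as np
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.preprocessing import normalize

    X = normalize(np.asarray(vecs, dtype=np.float32))        # unit-length rows -> cosine-friendly
    labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(X)
    themes_ = []
    for l in range(labels.max() + 1):
        mask = labels == l
        if mask.sum() < 2:
            continue                                          # same "skip tiny clusters" rule as above
        center = X[mask].mean(axis=0)
        sims = X[mask] @ (center / np.linalg.norm(center))    # cosine similarity to centroid
        idx = np.flatnonzero(mask)[(-sims).argsort()][:top_k] # global indices, most representative first
        themes_.append([paragraphs[i] for i in idx])
    return themes_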
def themes(eids, algo='agglomorative'):
    with session() as sess:
        # use Model to decrypt fields
        res = sess.query(M.CacheEntry)\
            .with_entities(M.CacheEntry.paras, M.CacheEntry.clean, M.CacheEntry.vectors)\
            .join(M.Entry, M.Entry.id == M.CacheEntry.entry_id)\
            .filter(M.Entry.id.in_(eids))\
            .order_by(M.Entry.created_at.desc())\
            .all()
    # assert len(eids) == len(res)
    entries = pd.Series([e for r in res for e in r.paras])
    stripped = pd.Series([c for r in res for c in r.clean])
    vecs = []
    for r in res:
        if r.vectors:
            vecs += r.vectors
    # if not vecs: return False  # TODO something else to return?
    vecs = np.vstack(vecs).astype(np.float32)

    clusters = Similars(vecs).normalize().cluster(algo=algo).value()

    topics = []
    # labels run 0..max, so include the last cluster (range(clusters.max()) would skip it)
    for l in range(clusters.max() + 1):
        in_clust = clusters == l
        n_entries = in_clust.sum().item()
        print('n_entries', n_entries)
        if n_entries < 2:
            print('skipping')
            continue
        vecs_, stripped_, entries_ = vecs[in_clust], \
            stripped.iloc[in_clust], entries.iloc[in_clust]
        center = vecs_.mean(axis=0)[np.newaxis, :]
        dists = Similars(center, vecs_).normalize().cosine().value().squeeze()
        entries_ = entries_.iloc[dists.argsort()].tolist()[:5]
        terms = top_terms(stripped_.tolist())
        topics.append({
            'n_entries': n_entries,
            'terms': terms,
            'summary': entries_,  # add full thing, will batch-compute next
            'sentiment': None,
        })

    groups = [t['summary'] for t in topics]
    batch_summaries = nlp_.summarization(groups, min_length=50, max_length=300)
    for i, res in enumerate(batch_summaries):
        print(res)
        topics[i]['summary'] = res['summary']
        topics[i]['sentiment'] = res['sentiment']

    topics = {'terms': top_terms(stripped, 10), 'themes': topics}
    return topics
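
# --- Hedged sketch (illustration only, not part of the app) -----------------------------
# One way a top_terms-style helper could work. The real top_terms() used above lives
# elsewhere in this repo and may differ; this sketch assumes a plain CountVectorizer over
# the already-cleaned paragraph strings and returns the n most frequent terms. Names
# (sketch_top_terms, n) are illustrative.
def sketch_top_terms(docs, n=8):
    from sklearn.feature_extraction.text import CountVectorizer
    vec = CountVectorizer(stop_words='english')
    counts = vec.fit_transform(docs)          # (n_docs, n_terms) sparse matrix
    totals = counts.sum(axis=0).A1            # total count per term across all docs
    terms = vec.get_feature_names_out()
    return [terms[i] for i in totals.argsort()[::-1][:n]]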
def nlp_on_rows_(method, id, job_id, sess, uids):
    """
    Generate cache_{method} for all missing rows for this user. E.g., for entries we generate
    BERT, paras, clean for the entry specified by id, but also fill in any missing entries in
    the process. Then it checks whether any users have missing rows (recursive call) and runs
    for them too.
    """
    # keep job alive, in case we're recursing and it's taking long
    if job_id is not None:
        sess.execute(text(f"""
        update jobs set updated_at={utcnow} where id=:job_id
        """), dict(job_id=str(job_id)))
        sess.commit()

    for_entries = method == 'entries'  # else is_profile
    m = M.Entry if for_entries else M.User

    # TODO id ignored for now, just grab everything that needs updating
    if for_entries:
        # main_entry = sess.query(m).get(id)
        rows_ = sess.query(m).filter(
            m.text.isnot(None),
            m.no_ai.isnot(True),
            m.ai_ran.isnot(True),
            # m.user_id == main_entry.user_id
        ).limit(10)
    else:
        rows_ = sess.query(m).filter(
            m.bio.isnot(None),
            m.ai_ran.isnot(True)
        ).limit(10)
    rows_ = rows_.all()

    # finished recursing
    if not rows_:
        return uids

    rows = []
    paras_grouped = []
    for r in rows_:
        txt = r.text if for_entries \
            else r.bio  # r.profile_to_text()  # TODO profile_to_text adds people
        paras = CleanText([txt]).markdown_split_paragraphs().value()
        if not paras:
            # Set everything with not-enough-content to ai_ran, and skip
            if for_entries:
                r.title_summary = r.text_summary = r.sentiment = None
            r.ai_ran = True
            sess.commit()
            continue
        uids.add(r.user_id if for_entries else r.id)
        rows.append(r)
        paras_grouped.append(paras)

    # Everything was too short on content; nothing to do now.
    if not rows:
        return uids

    paras_flat = [p for paras in paras_grouped for p in paras]

    fkeys = [r.title for r in rows] \
        if for_entries else [r.email for r in rows]
    fixt = fixtures.load_nlp_rows(fkeys, method=method)
    if fixt:
        if for_entries:
            embeds, titles, texts = fixt
        else:
            embeds = fixt
    else:
        # embeds = Similars(paras_flat).embed().autoencode(save_load_path=vars.AE_PATH).value()
        embeds = nlp_.sentence_encode(paras_flat).tolist()
        if for_entries:
            titles = nlp_.summarization(paras_grouped, min_length=5, max_length=20, with_sentiment=False)
            texts = nlp_.summarization(paras_grouped, min_length=30, max_length=250)

    upserts = []
    for i, r in enumerate(rows):
        # Save the cache_entry (paras, clean, vectors)
        paras = paras_grouped[i]
        ct = len(paras)
        id_key = {'entries': 'entry_id', 'profiles': 'user_id'}[method]
        upserts.append({
            id_key: r.id,
            'paras': paras,
            'vectors': embeds[:ct],
        })

        # Save the fixture for later
        fixt = (embeds[:ct], titles[i], texts[i]) \
            if for_entries else (embeds[:ct],)
        fixt_k = r.title if for_entries else r.email
        fixtures.save_nlp_row(fixt_k, fixt, method=method)

        embeds = embeds[ct:]

        if for_entries:
            r.title_summary = titles[i]["summary"]
            r.text_summary = texts[i]["summary"]
            r.sentiment = texts[i]["sentiment"]
        r.ai_ran = True
        # sess.commit()  # deferring till later, so queue up writes for perf

    m = M.CacheEntry if for_entries else M.CacheUser
    insert = postgresql.insert(m.__table__).values(upserts)
    sess.execute(insert.on_conflict_do_update(
        constraint=m.__table__.primary_key,
        set_=dict(paras=insert.excluded.paras, vectors=insert.excluded.vectors)
    ))
    sess.commit()

    # recurse to handle any other users with missing cache_{method}. Real return (empty object)
    # handled at top.
    return nlp_on_rows_(method, None, job_id, sess, uids)
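
# --- Hedged sketch (illustration only, not part of the app) -----------------------------
# The flatten -> batch-encode -> regroup pattern used by nlp_on_rows_ above: paragraphs from
# every row are embedded in a single call (cheaper on the GPU), then sliced back out per row
# by paragraph count, mirroring the embeds[:ct] / embeds[ct:] dance in the loop. encode_fn
# stands in for nlp_.sentence_encode and is an assumption, not this project's API.
def sketch_regroup_embeddings(paras_grouped, encode_fn):
    paras_flat = [p for paras in paras_grouped for p in paras]
    embeds = list(encode_fn(paras_flat))    # one embedding per flattened paragraph
    grouped = []
    for paras in paras_grouped:
        ct = len(paras)
        grouped.append(embeds[:ct])         # this row's embeddings, in original order
        embeds = embeds[ct:]                # advance the window to the next row
    return grouped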
def nlp_on_rows(method='entries'):
    for_entries = method == 'entries'  # else is_profile
    with session() as sess:
        if for_entries:
            rows = sess.query(M.Entry) \
                .filter(
                    func.length(M.Entry.text) > 64,
                    M.Entry.no_ai.isnot(True),
                    M.Entry.ai_ran.isnot(True)
                )
        else:
            rows = sess.query(M.User) \
                .filter(
                    func.length(M.User.bio) > 32,
                    M.User.ai_ran.isnot(True)
                )
        rows = rows.all()
        if not rows:
            return {}

        paras_grouped = []
        uids = set()
        for r in rows:
            txt = r.text if for_entries \
                else r.bio  # r.profile_to_text()  # TODO profile_to_text adds people
            paras_grouped.append(cleantext.markdown_split_paragraphs([txt]))
            if for_entries:
                uids.add(r.user_id)
        paras_flat = [p for paras in paras_grouped for p in paras]

        fkeys = [r.title for r in rows] \
            if for_entries else [r.email for r in rows]
        fixt = fixtures.load_nlp_rows(fkeys, method=method)
        if fixt:
            if for_entries:
                clean_txt, embeds, titles, texts = fixt
            else:
                clean_txt, embeds = fixt
        else:
            clean_txt = cleantext.keywords(
                paras_flat, postags=['NOUN', 'ADJ', 'VERB', 'PROPN'])
            embeds = nlp_.sentence_encode(paras_flat).tolist()
            if for_entries:
                titles = nlp_.summarization(paras_grouped, min_length=5, max_length=20, with_sentiment=False)
                texts = nlp_.summarization(paras_grouped, min_length=30, max_length=250)

        for i, r in enumerate(rows):
            CM = M.CacheEntry if for_entries else M.CacheUser
            c = sess.query(CM).get(r.id)
            if not c:
                c = CM(entry_id=r.id) if for_entries else CM(user_id=r.id)
                sess.add(c)
            # Save the cache_entry (paras, clean, vectors)
            paras = paras_grouped[i]
            c.paras = paras
            ct = len(paras)
            c.clean = [' '.join(e) for e in clean_txt[:ct]]
            c.vectors = embeds[:ct]
            sess.commit()

            # Save the fixture for later
            fixt = (clean_txt[:ct], embeds[:ct], titles[i], texts[i]) \
                if for_entries else (clean_txt[:ct], embeds[:ct])
            fixt_k = r.title if for_entries else r.email
            fixtures.save_nlp_row(fixt_k, fixt, method=method)

            clean_txt, embeds = clean_txt[ct:], embeds[ct:]

            if for_entries:
                r.title_summary = titles[i]["summary"]
                r.text_summary = texts[i]["summary"]
                r.sentiment = texts[i]["sentiment"]
            r.ai_ran = True
            sess.commit()

        if for_entries:
            # 9131155e: only update every x entries
            M.Job.multiple_book_jobs(list(uids))
    return {}
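
# --- Hedged sketch (illustration only, not part of the app) -----------------------------
# The batched Postgres upsert that nlp_on_rows_ uses, shown against a toy table. Where
# nlp_on_rows above writes each cache row with get()/add()/commit(), nlp_on_rows_ folds all
# rows into one INSERT ... ON CONFLICT DO UPDATE. Table and column names here (cache_demo,
# entry_id, paras, vectors) are illustrative assumptions; the real code targets
# M.CacheEntry/M.CacheUser and passes the table's primary-key constraint rather than
# index_elements.
def sketch_bulk_upsert(engine, upserts):
    import sqlalchemy as sa
    from sqlalchemy.dialects import postgresql

    md = sa.MetaData()
    cache_demo = sa.Table(
        'cache_demo', md,
        sa.Column('entry_id', sa.String, primary_key=True),
        sa.Column('paras', sa.JSON),
        sa.Column('vectors', sa.JSON),
    )
    md.create_all(engine)

    insert_stmt = postgresql.insert(cache_demo).values(upserts)
    upsert_stmt = insert_stmt.on_conflict_do_update(
        index_elements=[cache_demo.c.entry_id],   # conflict target: the primary-key column
        set_=dict(paras=insert_stmt.excluded.paras, vectors=insert_stmt.excluded.vectors),
    )
    with engine.begin() as conn:
        conn.execute(upsert_stmt)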