def gen_keywords(for_entries, uid, sess):
    """
    Finalize cache_entries by generating keywords. This must run last so bigrams
    are generated over the user's full corpus of paras (not per-entry).
    """
    if for_entries:
        m, col = M.CacheEntry, M.CacheEntry.entry_id
        # generate bigrams within a user's entry-corpus
        rows = sess.query(m)\
            .options(sa.orm.load_only(col, m.paras))\
            .join(M.Entry, M.Entry.id == col)\
            .filter(M.Entry.user_id == uid, sa.func.array_length(m.paras, 1) > 0)\
            .all()
    else:
        m, col = M.CacheUser, M.CacheUser.user_id
        # generate bigrams across user profiles
        rows = sess.query(m)\
            .options(sa.orm.load_only(col, m.paras))\
            .filter(sa.func.array_length(m.paras, 1) > 0)\
            .all()
    if not rows:
        return
    paras_flat = [p for row in rows for p in row.paras]
    keywords = CleanText(paras_flat) \
        .keywords(postags=['NOUN', 'ADJ', 'VERB', 'PROPN'],
                  mode='fast', bigram_min_count=2, bigram_threshold=2) \
        .join().value()
    for r in rows:
        ct = len(r.paras)
        # keywords is flat and ordered like paras_flat, so shift off this row's share
        kw, keywords = keywords[:ct], keywords[ct:]
        r.clean = kw
    sess.commit()
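# A minimal usage sketch, not repo code: `user` is a hypothetical M.User row and
# `sess` a live SQLAlchemy session. gen_keywords runs once per corpus so the bigram
# model sees every paragraph a user wrote, not just one entry's worth. (For the
# profile pass, uid is currently unused by the query, which spans all users.)
def gen_keywords_for_user(user, sess):
    gen_keywords(for_entries=True, uid=user.id, sess=sess)   # per-entry keyword cache
    gen_keywords(for_entries=False, uid=user.id, sess=sess)  # profile keyword cache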
def test_md_split_1(): doc = articles()[0] paras = CleanText(doc) \ .markdown_split_paragraphs() \ .value() assert len(paras) > 1 print(paras)
def test_md_split_all(): docs = articles() paras = CleanText(docs)\ .markdown_split_paragraphs()\ .value() assert len(paras) > 0 assert len(docs) < len(paras) print(paras)
def test_join(content):
    ndim, content = content
    v = CleanText(content).join().value()
    if ndim == 0:
        # already a joined string; join() should be a pass-through
        assert v == 'token1 token2 token3 token4'
    if ndim == 1:
        assert v == 'token1 token2 token3 token4'
    if ndim == 2:
        assert len(v) == 2
        assert v[0] == 'token1 token2'
        assert v[1] == 'token3 token4'
def test_normalize(fmt, coverage, mode): chain = CleanText(articles(fmt=fmt)) if coverage == "basic": chain = chain.keywords(mode=mode) else: # Revisit this list as cleantext.py grows chain = chain\ .unmark()\ .strip_html()\ .normalize_numbers()\ .fix_punct()\ .only_english()\ .only_ascii()\ .remove_apos()\ .multiple_whitespace()\ .keywords(mode=mode) clean = chain.join().value() assert len(chain.data.lemmas) > 10 print(chain.data.lemmas[:5]) assert len(clean) > 10 print(clean[0])
def nlp_on_rows_(method, id, job_id, sess, uids):
    """
    Generate cache_{method} for all missing rows for this user. Eg, for entries we
    generate BERT embeddings, paras, and clean text for the entry specified by id,
    but also fill in any missing entries in the process. Then we check whether any
    other users have missing rows (recursive call) and run for them too.
    """
    # keep the job alive, in case we're recursing and it's taking a long time
    if job_id is not None:
        sess.execute(text(f"""
        update jobs set updated_at={utcnow} where id=:job_id
        """), dict(job_id=str(job_id)))
        sess.commit()
    for_entries = method == 'entries'  # else it's a profile
    m = M.Entry if for_entries else M.User

    # TODO id is ignored for now; just grab everything that needs updating
    if for_entries:
        # main_entry = sess.query(m).get(id)
        rows_ = sess.query(m).filter(
            m.text.isnot(None),
            m.no_ai.isnot(True),
            m.ai_ran.isnot(True),
            # m.user_id == main_entry.user_id
        ).limit(10)
    else:
        rows_ = sess.query(m).filter(
            m.bio.isnot(None),
            m.ai_ran.isnot(True)
        ).limit(10)
    rows_ = rows_.all()

    # finished recursing
    if not rows_:
        return uids

    rows = []
    paras_grouped = []
    for r in rows_:
        txt = r.text if for_entries \
            else r.bio  # r.profile_to_text()  # TODO profile_to_text adds people
        paras = CleanText([txt]).markdown_split_paragraphs().value()
        if not paras:
            # Not enough content: mark ai_ran so it's not re-processed, and skip
            if for_entries:
                r.title_summary = r.text_summary = r.sentiment = None
            r.ai_ran = True
            sess.commit()
            continue
        uids.add(r.user_id if for_entries else r.id)
        rows.append(r)
        paras_grouped.append(paras)

    # Every row was too short on content; nothing to do for this batch.
    if not rows:
        return uids

    paras_flat = [p for paras in paras_grouped for p in paras]

    fkeys = [r.title for r in rows] \
        if for_entries else [r.email for r in rows]
    fixt = fixtures.load_nlp_rows(fkeys, method=method)
    if fixt:
        if for_entries:
            embeds, titles, texts = fixt
        else:
            embeds = fixt
    else:
        # embeds = Similars(paras_flat).embed().autoencode(save_load_path=vars.AE_PATH).value()
        embeds = nlp_.sentence_encode(paras_flat).tolist()
        if for_entries:
            titles = nlp_.summarization(paras_grouped, min_length=5, max_length=20, with_sentiment=False)
            texts = nlp_.summarization(paras_grouped, min_length=30, max_length=250)

    upserts = []
    for i, r in enumerate(rows):
        # Save the cache_entry (paras, clean, vectors)
        paras = paras_grouped[i]
        ct = len(paras)
        id_key = {'entries': 'entry_id', 'profiles': 'user_id'}[method]
        upserts.append({
            id_key: r.id,
            'paras': paras,
            'vectors': embeds[:ct],
        })

        # Save the fixture for later
        fixt = (embeds[:ct], titles[i], texts[i]) \
            if for_entries else (embeds[:ct],)
        fixt_k = r.title if for_entries else r.email
        fixtures.save_nlp_row(fixt_k, fixt, method=method)

        embeds = embeds[ct:]

        if for_entries:
            r.title_summary = titles[i]["summary"]
            r.text_summary = texts[i]["summary"]
            r.sentiment = texts[i]["sentiment"]
        r.ai_ran = True
        # sess.commit() deferred till below, queuing up the writes for perf

    m = M.CacheEntry if for_entries else M.CacheUser
    insert = postgresql.insert(m.__table__).values(upserts)
    sess.execute(insert.on_conflict_do_update(
        constraint=m.__table__.primary_key,
        set_=dict(paras=insert.excluded.paras, vectors=insert.excluded.vectors)
    ))
    sess.commit()

    # Recurse to handle any other users with missing cache_{method}; the real
    # return value (the uids set) comes from the `if not rows_` check at the top.
    return nlp_on_rows_(method, None, job_id, sess, uids)
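# Standalone sketch of the upsert pattern used at the end of nlp_on_rows_ above.
# This is stock SQLAlchemy postgresql-dialect usage, not repo code; the table name
# and column types are hypothetical simplifications (real vectors are nested lists).
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

metadata = sa.MetaData()
cache_demo = sa.Table(
    'cache_demo', metadata,
    sa.Column('entry_id', sa.String, primary_key=True),
    sa.Column('paras', postgresql.ARRAY(sa.String)),
    sa.Column('vectors', postgresql.JSONB),
)

def upsert_cache(conn, upserts):
    # upserts: list of dicts keyed by the column names above. On PK conflict,
    # overwrite paras/vectors instead of erroring: insert-or-update in one round trip.
    stmt = postgresql.insert(cache_demo).values(upserts)
    conn.execute(stmt.on_conflict_do_update(
        constraint=cache_demo.primary_key,
        set_=dict(paras=stmt.excluded.paras, vectors=stmt.excluded.vectors),
    ))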
def load_df(self):
    if exists(paths.df):
        logger.info("Load books.df")
        self.df = pd.read_feather(paths.df)\
            .drop(columns=['index'])\
            .set_index('id', drop=False)
        return

    # rebuilding the df, so invalidate the embeddings; they're now out of sync
    try:
        os.remove(paths.vecs)
    except FileNotFoundError:
        pass

    logger.info("Load books MySQL")
    # 58fbd36a: limit to psychology topics
    sql = """
    select u.ID, u.Title, u.Author, d.descr, t.topic_descr
    from updated u
        inner join description d on d.md5=u.MD5
        inner join topics t on u.Topic=t.topic_id
            -- later more languages; but I think it's only Russian in Libgen?
            and t.lang='en'
    where u.Language = 'English'
        -- Make sure there's some content to work with
        and length(d.descr) > 200 and length(u.Title) > 1
    """
    with session('books') as sessb:
        df = pd.read_sql(sql, sessb.bind)
    df = df.rename(columns=dict(
        ID='id',
        descr='text',
        Title='title',
        Author='author',
        topic_descr='topic',
    ))

    logger.info(f"n_books before cleanup {df.shape[0]}")
    logger.info("Remove HTML")

    # some books are literally just ###### or ??????
    df = df[~(df.title + df.text).str.contains(r'(?:\?\?\?|###)')]

    df['text'] = CleanText(df.text.tolist())\
        .strip_html()\
        .only_ascii()\
        .multiple_whitespace()\
        .value()
    df['txt_len'] = df.text.str.len()

    # Ensure there's enough content. Drop dupes, keeping the longest description
    df = df[df.txt_len > 150]\
        .sort_values('txt_len', ascending=False)\
        .drop_duplicates('id')\
        .drop_duplicates(['title', 'author'])\
        .drop(columns=['txt_len'])
    # books = books[books.clean.apply(lambda x: detect(x) == 'en')]

    logger.info(f"n_books after cleanup {df.shape[0]}")
    logger.info("Save books.df")
    # Feather errors with "does not support serializing a non-default index; you can
    # .reset_index() to make the index into column(s)", even though no index has been
    # set yet. Have to reset_index() anyway to work around it.
    df = df.reset_index()
    df.to_feather(paths.df)

    # call self, which returns the newly-saved df (ensures consistent order, etc)
    self.load_df()
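# Minimal, standalone repro of the feather index quirk worked around in load_df
# (plain pandas; the file path is hypothetical). Feather refuses to serialize a
# non-default index, so the index is reset before write and rebuilt after read:
import pandas as pd

df = pd.DataFrame({'id': ['a', 'b'], 'text': ['x', 'y']})
df.reset_index().to_feather('/tmp/books_demo.feather')  # 'index' becomes a column
df2 = pd.read_feather('/tmp/books_demo.feather')\
    .drop(columns=['index'])\
    .set_index('id', drop=False)
assert list(df2.index) == ['a', 'b'] and 'index' not in df2.columns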
def test_unmark(content): res = CleanText(content).unmark().value() print(res) assert type(res) == str assert "#" not in res assert "*" not in res
def test_md_split_specific(content): ct, md = content res = CleanText(md).markdown_split_paragraphs().value() print(res) assert len(res) == ct
def test_keywords_fast(content):
    res = CleanText(content[0]).keywords(mode='fast').join().value()
    assert res == content[1]
def test_strip_html(content): res = CleanText([content]).strip_html().value() print(res) assert type(res) == str assert "<" not in res