Example #1
def gen_keywords(for_entries, uid, sess):
    # Finalize cache_entries by generating keywords. This has to run last so that
    # bigrams are built over the user's full corpus of paras (not per entry).
    if for_entries:
        m, col = M.CacheEntry, M.CacheEntry.entry_id
        # generate bigrams within a user's entry-corpus
        rows = sess.query(m)\
            .options(sa.orm.load_only(col, m.paras))\
            .join(M.Entry, M.Entry.id == col)\
            .filter(M.Entry.user_id==uid, sa.func.array_length(m.paras, 1)>0)\
            .all()
    else:
        m, col = M.CacheUser, M.CacheUser.user_id
        # Generate bigrams across user profiles
        rows = sess.query(m)\
            .options(sa.orm.load_only(col, m.paras))\
            .filter(sa.func.array_length(m.paras, 1) > 0)\
            .all()
    if not rows: return
    paras_flat = [p for row in rows for p in row.paras]
    keywords = CleanText(paras_flat) \
        .keywords(postags=['NOUN', 'ADJ', 'VERB', 'PROPN'], mode='fast', bigram_min_count=2, bigram_threshold=2) \
        .join().value()
    for r in rows:
        ct = len(r.paras)
        kw = keywords[:ct]
        keywords = keywords[ct:]  # consume this row's slice; the remainder belongs to later rows
        r.clean = kw
    sess.commit()
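
The tail of gen_keywords is easy to misread: keywords are computed once over the flattened corpus, then handed back to each row by its paragraph count. A minimal standalone sketch of that flatten-then-reslice pattern, in plain Python with illustrative names (no CleanText involved):

def reslice_per_row(rows_paras, processed):
    # Give each row back as many processed items as it contributed paragraphs.
    out = []
    for paras in rows_paras:
        ct = len(paras)
        out.append(processed[:ct])   # this row's slice
        processed = processed[ct:]   # the remainder belongs to later rows
    return out

# Two rows contributing 2 and 3 paragraphs respectively
rows_paras = [['p1', 'p2'], ['p3', 'p4', 'p5']]
flat = [p for paras in rows_paras for p in paras]   # processed as one corpus, like paras_flat above
processed = [f'kw:{p}' for p in flat]               # stand-in for the keywords() step
assert reslice_per_row(rows_paras, processed) == [['kw:p1', 'kw:p2'], ['kw:p3', 'kw:p4', 'kw:p5']]
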
Example #2
def test_md_split_1():
    doc = articles()[0]
    paras = CleanText(doc) \
        .markdown_split_paragraphs() \
        .value()
    assert len(paras) > 1
    print(paras)
Example #3
def test_md_split_all():
    docs = articles()
    paras = CleanText(docs)\
        .markdown_split_paragraphs()\
        .value()
    assert len(paras) > 0
    assert len(docs) < len(paras)
    print(paras)
Example #4
def test_join(content):
    ndim, content = content
    v = CleanText(content).join().value()
    if ndim == 0:
        assert v == 'token1 token2 token3 token4'
    if ndim == 1:
        assert v == 'token1 token2 token3 token4'
    if ndim == 2:
        assert len(v) == 2
        assert v[0] == 'token1 token2'
        assert v[1] == 'token3 token4'
Example #5
def test_normalize(fmt, coverage, mode):
    chain = CleanText(articles(fmt=fmt))
    if coverage == "basic":
        chain = chain.keywords(mode=mode)
    else:
        # Revisit this list as cleantext.py grows
        chain = chain\
            .unmark()\
            .strip_html()\
            .normalize_numbers()\
            .fix_punct()\
            .only_english()\
            .only_ascii()\
            .remove_apos()\
            .multiple_whitespace()\
            .keywords(mode=mode)
    clean = chain.join().value()
    assert len(chain.data.lemmas) > 10
    print(chain.data.lemmas[:5])
    assert len(clean) > 10
    print(clean[0])
Example #6
def nlp_on_rows_(method, id, job_id, sess, uids):
    """
    Generate cache_{method} for all missing rows for this user. Eg, for entries we generate
    BERT,paras,clean for the entry specified by id; but also fill in any missing entries in the process.
    Then it checks if any users have missing rows (recursive call) and runs for them too.
    """

    # keep job alive, in case we're recursing and it's taking long
    if job_id is not None:
        sess.execute(text(f"""
        update jobs set updated_at={utcnow} where id=:job_id
        """), dict(job_id=str(job_id)))
        sess.commit()

    for_entries = method == 'entries'  # else is_profile
    m = M.Entry if for_entries else M.User
    # TODO id ignored for now, just grab everything that needs updating
    if for_entries:
        # main_entry = sess.query(m).get(id)
        rows_ = sess.query(m).filter(
            m.text.isnot(None),
            m.no_ai.isnot(True),
            m.ai_ran.isnot(True),
            # m.user_id == main_entry.user_id
        ).limit(10)
    else:
        rows_ = sess.query(m).filter(
            m.bio.isnot(None),
            m.ai_ran.isnot(True)
        ).limit(10)
    rows_ = rows_.all()
    # finished recursing
    if not rows_: return uids

    rows = []
    paras_grouped = []
    for r in rows_:
        txt = r.text if for_entries \
            else r.bio  # r.profile_to_text()  # TODO profile_to_text adds people
        paras = CleanText([txt]).markdown_split_paragraphs().value()
        if not paras:
            # Set everything with not-enough-content to ai_ran, and skip
            if for_entries:
                r.title_summary = r.text_summary = r.sentiment = None
            r.ai_ran = True
            sess.commit()
            continue
        uids.add(r.user_id if for_entries else r.id)
        rows.append(r)
        paras_grouped.append(paras)

    # Everything was too short on content; nothing to do now.
    if not rows: return uids

    paras_flat = [p for paras in paras_grouped for p in paras]

    fkeys = [r.title for r in rows] \
        if for_entries else [r.email for r in rows]
    fixt = fixtures.load_nlp_rows(fkeys, method=method)
    if fixt:
        if for_entries:
            embeds, titles, texts = fixt
        else:
            embeds = fixt
    else:
        # embeds = Similars(paras_flat).embed().autoencode(save_load_path=vars.AE_PATH).value()
        embeds = nlp_.sentence_encode(paras_flat).tolist()
        if for_entries:
            titles = nlp_.summarization(paras_grouped, min_length=5, max_length=20, with_sentiment=False)
            texts = nlp_.summarization(paras_grouped, min_length=30, max_length=250)

    upserts = []
    for i, r in enumerate(rows):
        # Save the cache_entry (paras,clean,vectors)
        paras = paras_grouped[i]
        ct = len(paras)
        id_key = {'entries': 'entry_id', 'profiles': 'user_id'}[method]
        upserts.append({
            id_key: r.id,
            'paras': paras,
            'vectors': embeds[:ct],
        })

        # Save the fixture for later
        fixt = (embeds[:ct], titles[i], texts[i]) \
            if for_entries else (embeds[:ct],)
        fixt_k = r.title if for_entries else r.email
        fixtures.save_nlp_row(fixt_k, fixt, method=method)

        embeds = embeds[ct:]

        if for_entries:
            r.title_summary = titles[i]["summary"]
            r.text_summary = texts[i]["summary"]
            r.sentiment = texts[i]["sentiment"]
        r.ai_ran = True
        # sess.commit()  # deferring till later, so queue up writes for perf

    m = M.CacheEntry if for_entries else M.CacheUser
    insert = postgresql.insert(m.__table__).values(upserts)
    sess.execute(insert.on_conflict_do_update(
        constraint=m.__table__.primary_key,
        set_=dict(paras=insert.excluded.paras, vectors=insert.excluded.vectors)
    ))
    sess.commit()

    # Recurse to handle any other users with missing cache_{method}. The real return (the
    # accumulated uids) happens at the top, once no rows are left.
    return nlp_on_rows_(method, None, job_id, sess, uids)
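
The bulk write near the end uses SQLAlchemy's PostgreSQL INSERT ... ON CONFLICT DO UPDATE support. A minimal sketch of that upsert idiom in isolation, with a hypothetical cache_demo table standing in for cache_entries/cache_users (requires a PostgreSQL connection):

import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

metadata = sa.MetaData()
cache_demo = sa.Table(
    'cache_demo', metadata,
    sa.Column('entry_id', sa.Integer, primary_key=True),
    sa.Column('paras', postgresql.ARRAY(sa.Text)),
)

def upsert_paras(conn, rows):
    # rows: list of dicts like {'entry_id': 1, 'paras': ['p1', 'p2']}
    insert = postgresql.insert(cache_demo).values(rows)
    conn.execute(insert.on_conflict_do_update(
        constraint=cache_demo.primary_key,        # conflict target: the primary key, as above
        set_=dict(paras=insert.excluded.paras),   # on conflict, overwrite the cached paras
    ))

Called with a connection from engine.begin(), the same rows can be written repeatedly; existing primary keys are updated in place instead of raising a unique-violation error.
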
Example #7
    def load_df(self):
        if exists(paths.df):
            logger.info("Load books.df")
            self.df = pd.read_feather(paths.df)\
                .drop(columns=['index'])\
                .set_index('id', drop=False)
            return

        # invalidate embeddings, they're out of sync
        try:
            os.remove(paths.vecs)
        except OSError:
            pass  # nothing cached yet

        logger.info("Load books MySQL")
        # 58fbd36a: limit to psychology topics
        sql = f"""
        select u.ID, u.Title, u.Author, d.descr, t.topic_descr
        from updated u
            inner join description d on d.md5=u.MD5
            inner join topics t on u.Topic=t.topic_id
                -- later more languages; but I think it's only Russian in Libgen?
                and t.lang='en'
        where u.Language = 'English'
            -- Make sure there's some content to work with
            and length(d.descr) > 200 and length(u.Title) > 1
        """
        with session('books') as sessb:
            df = pd.read_sql(sql, sessb.bind)
        df = df.rename(columns=dict(
            ID='id',
            descr='text',
            Title='title',
            Author='author',
            topic_descr='topic',
        ))

        logger.info(f"n_books before cleanup {df.shape[0]}")
        logger.info("Remove HTML")

        # some books are literally just ########
        df = df[~(df.title + df.text).str.contains(r'\?\?\?|###')]

        df['text'] = CleanText(df.text.tolist())\
            .strip_html()\
            .only_ascii()\
            .multiple_whitespace()\
            .value()
        df['txt_len'] = df.text.str.len()
        # Ensure there's content. Drop dupes, keeping the row with the longest description
        df = df[df.txt_len > 150]\
            .sort_values('txt_len', ascending=False)\
            .drop_duplicates('id')\
            .drop_duplicates(['title', 'author'])\
            .drop(columns=['txt_len'])
        # books = books[books.clean.apply(lambda x: detect(x) == 'en')]
        logger.info(f"n_books after cleanup {df.shape[0]}")

        logger.info(f"Save books.df")
        # feather raises "feather does not support serializing a non-default index for the index;
        # you can .reset_index() to make the index into column(s)". Even though no index has been
        # set here, the row filtering/sorting above leaves a gappy (non-default) index, so we
        # still have to reset_index() before writing.
        df = df.reset_index()
        df.to_feather(paths.df)
        # call self to load the newly-saved df (ensures the cached and fresh paths stay consistent)
        self.load_df()
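
The reset_index() step above is worth isolating: to_feather refuses any non-default index, and even though load_df never sets one, the row filtering and sorting leave a gappy index behind. A minimal round-trip sketch with an illustrative file path:

import pandas as pd

df = pd.DataFrame({'id': [1, 2, 3], 'text': ['a', 'b', 'c']})
df = df[df.id != 2]                  # filtering leaves index [0, 2]: no longer a default RangeIndex
# df.to_feather('books.feather')     # would raise ValueError here
df.reset_index().to_feather('books.feather')   # reset_index() keeps the old index as an 'index' column
df2 = pd.read_feather('books.feather')\
    .drop(columns=['index'])\
    .set_index('id', drop=False)     # mirrors the load path at the top of load_df
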
Example #8
def test_unmark(content):
    res = CleanText(content).unmark().value()
    print(res)
    assert type(res) == str
    assert "#" not in res
    assert "*" not in res
Example #9
def test_md_split_specific(content):
    ct, md = content
    res = CleanText(md).markdown_split_paragraphs().value()
    print(res)
    assert len(res) == ct
Example #10
def test_keywords_fast(content):
    assert CleanText(
        content[0]).keywords(mode='fast').join().value() == content[1]
Example #11
def test_strip_html(content):
    res = CleanText([content]).strip_html().value()
    print(res)
    assert type(res) == str
    assert "<" not in res