Example #1
 def prune():
     """prune machines which haven't removed themselves properly"""
     with session() as sess:
         sess.execute("""
         delete from machines where updated_at < now() - interval '5 minutes'
         """)
         sess.commit()
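Every example on this page leans on a session() context manager that isn't shown in the snippets themselves. A minimal sketch of what it presumably looks like, assuming a SQLAlchemy sessionmaker and the vars.DB_FULL connection string that appears in Example #26; vars.DB_BOOKS_URL is a hypothetical name for the libgen connection behind session('books'), and the import path for vars is also assumed:

from contextlib import contextmanager
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from common import vars  # assumed import path; the snippets reference vars.DB_FULL, vars.GA, etc.

# vars.DB_FULL appears in Example #26; vars.DB_BOOKS_URL is a hypothetical name for the libgen DB.
factories = {
    'main': sessionmaker(bind=create_engine(vars.DB_FULL)),
    'books': sessionmaker(bind=create_engine(vars.DB_BOOKS_URL)),
}

@contextmanager
def session(which='main'):
    # Yield a Session bound to the requested database and always close it afterwards.
    sess = factories[which]()
    try:
        yield sess
    finally:
        sess.close()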
Example #2
def await_job(jid):
    with session() as sess:
        params = {'jid': jid}
        i = 0
        while True:
            time.sleep(1)
            job = sess.execute(
                text("""
            select state, method from jobs where id=:jid
            """), params).fetchone()

            # TODO notify them of error?
            # 10 minutes, give up
            if job.state == 'error' or i > 60 * 10:
                return Box(method=job.method, data_out=False)

            if job.state == 'done':
                ## don't delete actually, let Job.prune handle that. Need last_job (created_at)
                #delete from jobs where id=:jid returning method, data_out
                job = sess.execute(
                    text("""
                select method, data_out from jobs where id=:jid
                """), params).fetchone()
                sess.commit()
                return job
            i += 1
Example #3
def ga(uid: Union[str, UUID4], category: str, action: str):
    """
    I'm only tracking interesting server-side events right now (no cookies); I want to see what features
    are being used and important bits like sign-ups & book-thumbs. The user id is obfuscated.
    https://developers.google.com/analytics/devguides/collection/protocol/v1/devguide#event
    """
    # actually don't care about uid, just need a unique identifier. Note this is
    # a 1-way hash (right?), so I can't even decrypt - just want unique per-feature track
    uid_ = str(uid).encode()  # to bytes
    uid_ = hashlib.sha256(uid_).hexdigest()
    url = "https://ssl.google-analytics.com/"
    url += "debug/collect" if DEBUG else "collect"
    res = requests.post(url,
                        params=dict(v=1,
                                    tid=vars.GA,
                                    cid=uid_,
                                    t='event',
                                    ec=category,
                                    ea=action))
    # if DEBUG: print(res.json())

    if action in ('register', 'like', 'dislike', 'therapist', 'notes'):
        with session() as sess:
            if sess.execute(
                    text("""
            select is_superuser su from users where id=:uid
            """), dict(uid=uid)).fetchone().su:
                # don't notify of my own or Lisa's entries
                return
        send_mail('*****@*****.**', 'action',
                  dict(category=category, action=action))
Example #4
def nlp_on_rows(method, id, job_id):
    for_entries = method == 'entries'
    with session() as sess:
        uids = nlp_on_rows_(method, id, job_id, sess, set())

        # the profiles method doesn't use uids, but create a 1-element array to iterate anyway
        if not for_entries: uids = [None]

        for i, uid in enumerate(uids):
            uid = str(uid)
            gen_keywords(for_entries, uid, sess)

            if for_entries:
                # Then add a book job for every user who was affected. Delay by x minutes per
                # job (jobs are pruned by updated_at < ?, so posting to the future)
                # 9131155e: only update every x entries
                future = datetime.utcnow() + timedelta(minutes=i*3)
                sess.add(M.Job(
                    method='books',
                    data_in={'args': [uid]},
                    created_at=future,
                    updated_at=future,
                ))
                sess.commit()
            else:
                match_profiles(sess)
        return {}
Example #5
def themes(eids, algo='agglomorative'):
    logger.info("Themes")
    with session() as sess:
        # use Model to decrypt fields
        E, CE = M.Entry, M.CacheEntry
        res = sess.query(CE) \
            .with_entities(CE.paras, CE.clean, CE.vectors) \
            .join(E, E.id == CE.entry_id) \
            .filter(E.id.in_(eids), func.array_length(CE.vectors,1)>0) \
            .order_by(E.created_at.desc()) \
            .all()
    # assert len(eids) == len(res)
    entries = pd.Series([e for r in res for e in r.paras])
    stripped = pd.Series([c for r in res for c in r.clean])
    vecs = np.vstack([r.vectors for r in res]).astype(np.float32)

    chain = Similars(vecs)
    if False and os.path.exists(vars.AE_PATH):
        chain = chain.autoencode(filename=vars.AE_PATH).cluster(algo=algo)
    else:
        chain = chain.normalize().cluster(algo=algo)
    clusters = chain.value()
    labels = chain.data.labels

    topics = []
    for l, center in enumerate(clusters):
        mask = labels == l
        n_entries = mask.sum().item()
        print('n_entries', n_entries)
        if n_entries < 2:
            print('skipping')
            continue

        vecs_, stripped_, entries_ = vecs[mask], stripped[mask], entries[mask]

        dists = Similars(center,
                         vecs_).normalize().cosine(abs=True).value().squeeze()
        entries_ = entries_.iloc[dists.argsort()].tolist()[:5]

        terms = top_terms(stripped_.tolist())
        topics.append({
            'n_entries': n_entries,
            'terms': terms,
            'summary': entries_,  # add full thing, will batch-compute next
            'sentiment': None,
        })

    groups = [t['summary'] for t in topics]
    batch_summaries = nlp_.summarization(groups, min_length=50, max_length=300)
    for i, res in enumerate(batch_summaries):
        print(res)
        topics[i]['summary'] = res['summary']
        topics[i]['sentiment'] = res['sentiment']

    topics = {'terms': top_terms(stripped, 10), 'themes': topics}

    return topics
Example #6
 def prune():
     """Prune completed or stuck jobs. Completed jobs aren't too useful for admins; errors are."""
     with session() as sess:
         sess.execute("""
         delete from jobs
         where updated_at < now() - interval '10 minutes'
             and state in ('working', 'done')
         """)
         sess.commit()
Example #7
def db(client):
    """await client to init_db"""
    with D.session() as sess:
        # wait for GPU to restart from no-db crash
        while True:
            sql = "select 1 from machines where status='on'"
            if M.await_row(sess, sql): break
            time.sleep(.5)

        yield sess
Example #8
def themes(eids, algo='agglomorative'):
    with session() as sess:
        # use Model to decrypt fields
        res = sess.query(M.CacheEntry)\
            .with_entities(M.CacheEntry.paras, M.CacheEntry.clean, M.CacheEntry.vectors)\
            .join(M.Entry, M.Entry.id == M.CacheEntry.entry_id)\
            .filter(M.Entry.id.in_(eids))\
            .order_by(M.Entry.created_at.desc())\
            .all()
    # assert len(eids) == len(res)
    entries = pd.Series([e for r in res for e in r.paras])
    stripped = pd.Series([c for r in res for c in r.clean])
    vecs = []
    for r in res:
        if r.vectors: vecs += r.vectors
    # if not vecs: return False  # TODO something else to return?
    vecs = np.vstack(vecs).astype(np.float32)

    clusters = Similars(vecs).normalize().cluster(algo=algo).value()

    topics = []
    for l in range(clusters.max()):
        in_clust = clusters == l
        n_entries = in_clust.sum().item()
        print('n_entries', n_entries)
        if n_entries < 2:
            print('skipping')
            continue

        vecs_, stripped_, entries_ = vecs[in_clust],\
            stripped.iloc[in_clust], entries.iloc[in_clust]

        center = vecs_.mean(axis=0)[np.newaxis, :]
        dists = Similars(center, vecs_).normalize().cosine().value().squeeze()
        entries_ = entries_.iloc[dists.argsort()].tolist()[:5]

        terms = top_terms(stripped_.tolist())
        topics.append({
            'n_entries': n_entries,
            'terms': terms,
            'summary': entries_,  # add full thing, will batch-compute next
            'sentiment': None,
        })

    groups = [t['summary'] for t in topics]
    batch_summaries = nlp_.summarization(groups, min_length=50, max_length=300)
    for i, res in enumerate(batch_summaries):
        print(res)
        topics[i]['summary'] = res['summary']
        topics[i]['sentiment'] = res['sentiment']

    topics = {'terms': top_terms(stripped, 10), 'themes': topics}

    return topics
Example #9
def influencers():
    with session() as sess:
        users = sess.execute(
            text(f"""
        select id::text from users
        where
          -- has logged in recently
          updated_at > now() - interval '2 days' and
          -- has been 1d since last-run (or never run)
          (extract(day from now() - last_influencers) >= 1 or last_influencers is null)
        """)).fetchall()
        for u in users:
            uid_ = dict(uid=u.id)
            sess.execute(
                text(f"""
            update users set last_influencers=now() where id=:uid
            """), uid_)
            sess.commit()

            res = influencers_(sess, u.id)
            if not res: continue

            # A field can get deleted while running XGB, causing a fkey constraint error.
            # https://docs.sqlalchemy.org/en/13/dialects/postgresql.html
            # Can't do on_conflict for FK constraints, get fresh ids and filter out missing ones.
            fids = [
                x.id for x in sess.execute(
                    text("""
            select id::text from fields where user_id=:uid
            """), uid_).fetchall()
            ]

            next_preds, importances, all_imps = res
            for fid, others in importances.items():
                if fid not in fids: continue
                inf_score, next_pred = all_imps[fid], next_preds[fid]

                insert = postgresql.insert(M.Influencer.__table__).values([
                    dict(field_id=fid, influencer_id=inf_id, score=score)
                    for inf_id, score in others.items() if inf_id in fids
                ])
                sess.execute(
                    insert.on_conflict_do_update(
                        constraint=M.Influencer.__table__.primary_key,
                        set_=dict(score=insert.excluded.score)))
                sess.execute(
                    text("""
                update fields set influencer_score=:score, next_pred=:pred
                where id=:fid;
                """), dict(score=inf_score, pred=next_pred, fid=fid))
                sess.commit()

    return {}
Example #10
def cloud_up_maybe():
    if is_dev(): return
    with session() as sess:
        if M.User.last_checkin(sess) > 15: return
        if M.Machine.gpu_status(sess) in ("on", "pending"): return

        logger.warning("Initing AWS Batch")
        M.Machine.notify_online(sess, 'batch', 'pending')
        boto3.client('batch').submit_job(
            jobName=str(uuid4()),
            jobQueue='gnothi-jq',
            jobDefinition='gnothi-jd',
        )
Example #11
def predict_books(user_id, vecs_user, n_recs=30, centroids=False):
    with session() as sess:
        # TODO should I move this down further, to get more lines to test?
        fixt = fixtures.load_books(user_id)
        if fixt is not None: return fixt

        vecs_books, books = load_books(sess)
        sql = "select book_id as id, user_id, shelf from bookshelf where user_id=%(uid)s"
        shelf = pd.read_sql(sql, sess.bind, params={'uid': user_id}).set_index('id', drop=False)
    shelf_idx = books.id.isin(shelf.id)

    # normalize for cosine, and downstream DNN
    chain = Similars(vecs_user, vecs_books).normalize()
    vecs_user, vecs_books = chain.value()

    logger.info("Finding cosine similarities")
    if centroids:
        labels = chain.agglomorative().value()
        lhs = np.vstack([
            vecs_user[labels == l].mean(0)
            for l in range(labels.max())
        ])
        chain = Similars(lhs, vecs_user)

    # Take best cluster-score for every book
    dist = chain.cosine().value().min(axis=0)
    # 0f29e591: minmax_scale(dist). norm_out=True works better
    # then map back onto books, so they're back in order (pandas index-matching)
    books['dist'] = dist

    if shelf_idx.sum() > 0:
        like, dislike = dist.min() - dist.std(), dist.max() + dist.std()
        shelf_map = dict(like=like, already_read=like, recommend=like, dislike=dislike, remove=None, ai=None)
        shelf['dist'] = shelf.shelf.apply(lambda k: shelf_map[k])
        shelf.dist.fillna(books.dist, inplace=True)  # fill in "remove"
        books.loc[shelf.index, 'dist'] = shelf.dist  # indexes(id) match, so assigns correctly
        assert not books.dist.isna().any(), "Messed up merging shelf/books.dist by index"

    # e2eaea3f: save/load dnn
    dnn = train_books_predictor(books, vecs_books, shelf_idx)

    books['dist'] = dnn.predict(vecs_books)

    # dupes by title in libgen
    # r = books.sort_values('dist')\
    df = books[~shelf_idx].sort_values('dist')\
        .drop_duplicates('title', keep='first')\
        .iloc[:n_recs]
    fixtures.save_books(user_id, df)
    return df
Example #12
    def multiple_book_jobs(uids):
        with session() as sess:
            sess.execute(
                satext("""
            update users set last_books=null where id in :uids;
            """), dict(uids=tuple(uids)))
            sess.commit()
        # TODO handle this in run.py when it's consuming jobs
        def delay_books(uid, i):
            time.sleep(i * 60 * 5)  # 5m
            Job.create_job('books', data_in=dict(args=[str(uid)]))

        for i, uid in enumerate(uids):
            threading.Thread(target=delay_books, args=(uid, i)).start()
Example #13
 def wrap_job(jid, method, fn):
     logger.info(f"Run job {method}")
     try:
         start = time.time()
         res = fn()
         sql = "update jobs set state='done', data_out=:data where id=:jid"
         logger.info(f"Job {method} complete {time.time() - start}")
     except Exception as err:
         err = str(traceback.format_exc())  # str(err)
         res = dict(error=err)
         sql = "update jobs set state='error', data_out=:data where id=:jid"
         logger.error(f"Job {method} error {time.time() - start} {err}")
     with session() as sess:
         sess.execute(satext(sql), dict(data=jsonb(res), jid=str(jid)))
         sess.commit()
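The jsonb() helper used in wrap_job isn't among these snippets (and satext is presumably sqlalchemy.text imported under an alias); jsonb() presumably just serializes the result dict so it can be bound into the jobs.data_out jsonb column. A minimal sketch under that assumption:

import json

def jsonb(obj):
    # Dump to a JSON string for the jsonb bind parameter;
    # default=str covers UUIDs/datetimes that the json module can't encode natively.
    return json.dumps(obj, default=str)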
Example #14
def match_profiles():
    with session() as sess:
        df = pd.read_sql(
            """
        select e.user_id, c.vectors from cache_entries c
        inner join entries e on e.id=c.entry_id
        where c.vectors is not null
        """, sess.bind)
        if not df.shape[0]: return

        # flatten multi-paragraph entries
        df['vectors'] = df.vectors.apply(mean_)
        # then mean the semantic of all entries for this user.
        # TODO cluster or something, just mean-ing all their entries is stupid
        df = df.groupby(['user_id']).vectors.agg(mean_)

        uids = df.index.tolist()
        vecs_entries = np.vstack(df.values)

        # TODO add community (M.User.public == True)
        df = pd.read_sql(
            """
        select c.user_id, c.vectors from cache_users c
        inner join users u on c.user_id=u.id 
        where u.therapist=true and c.vectors is not null 
        """, sess.bind)
        if not df.shape[0]: return
        match_ids = df.user_id.tolist()
        # This on the other hand is OK to mean, it's just their profile
        vecs_profiles = np.vstack(df.vectors.apply(mean_).values)

        dists = Similars(vecs_entries,
                         vecs_profiles).normalize().cosine().value()

        sess.execute(
            text("""
        delete from profile_matches where user_id in :uids
        """), dict(uids=tuple(uids)))
        sess.commit()

        # everything is in same order at this point
        sess.bulk_save_objects([
            M.ProfileMatch(user_id=uid, match_id=mid, score=dists[i, j])
            for i, uid in enumerate(uids) for j, mid in enumerate(match_ids)
        ])
        sess.commit()
Example #15
 def clear_fixtures(self):
     all_ = FRESH == 'all'
     if 'books' in FRESH or all_:
         self.rm(f"{BASE}/books.pkl")
     if 'entries' in FRESH or all_:
         self.rm(f"{BASE}/entries.pkl")
         self.rm(f"{BASE}/nlp_entries.pkl")
     if 'profiles' in FRESH or all_:
         self.rm(f"{BASE}/nlp_profiles.pkl")
     if 'wiki' in FRESH or all_:
         self.rm(f"{BASE}/wiki", isdir=True)
     if 'influencers' in FRESH or all_:
         self.rm(f"{BASE}/xgb_hypers.pkl")
     if 'liben' in FRESH or all_:
         self.rm(f"/storage/libgen_testing.npy")
         with session() as sess:
             sess.execute("delete from books")
             sess.commit()
Example #16
def run_job(job):
    jid_, k = str(job.id), job.method
    jid = {'jid': jid_}
    with session() as sess:
        data = sess.execute("select data_in from jobs where id=:jid", jid).fetchone().data_in
    args = data.get('args', [])
    kwargs = data.get('kwargs', {})

    if k in ('entries', 'profiles'):
        kwargs['job_id'] = jid_

    if k == 'books':
        nlp_.clear()
        os.system(f"python app/books.py --jid={jid_} --uid={args[0]}")
        return

    def fn(): return m[k](*args, **kwargs)
    M.Job.wrap_job(jid_, k, fn)
Example #17
def cloud_up_maybe():
    if is_dev(): return
    with session() as sess:
        if M.User.last_checkin(sess) > 10: return
        if M.Machine.gpu_status(sess) != "off": return

        logger.warning("Initing Paperspace")
        M.Machine.notify_online(sess, 'paperspace', 'pending')
        jobs = job_client.list()
        if any([j.state in up_states for j in jobs]):
            return

        vars_ = {**dict(vars), **{'MACHINE': 'paperspace'}}
        return job_client.create(machine_type='K80',
                                 container='lefnire/gnothi:gpu-0.0.13',
                                 project_id=vars.PAPERSPACE_PROJECT_ID,
                                 is_preemptible=True,
                                 command='python app/run.py',
                                 env_vars=vars_)
Example #18
def run_books(user_id):
    with session() as sess:
        user_id = str(user_id)
        uid = {'uid': user_id}

        # don't run if ran recently (notice the inverse if & comparator, simpler)
        if sess.execute(text(f"""
        select 1 from users 
        where id=:uid and last_books > {utcnow} - interval '10 minutes' 
        """), uid).fetchone():
            return
        sess.execute(text(f"""
        update users set last_books={utcnow} where id=:uid
        """), uid)
        sess.commit()

        entries = sess.execute(text("""
        select c.vectors from cache_entries c
        inner join entries e on e.id=c.entry_id and e.user_id=:uid
        order by e.created_at desc;
        """), uid).fetchall()
        profile = sess.execute(text("""
        select vectors from cache_users where user_id=:uid
        """), uid).fetchone()

        vecs = []
        if profile and profile.vectors:
            vecs += profile.vectors
        for e in entries:
            if e.vectors: vecs += e.vectors
        vecs = np.vstack(vecs).astype(np.float32)
        res = predict_books(user_id, vecs)

        sess.execute(text("""
        delete from bookshelf where user_id=:uid and shelf='ai'
        """), uid)
        sess.commit()
        res = res.rename(columns=dict(id='book_id', dist='score'))[['book_id', 'score']]
        res['user_id'] = user_id
        res['shelf'] = 'ai'
        res['created_at'] = res['updated_at'] = datetime.datetime.utcnow()
        res.to_sql('bookshelf', sess.bind, if_exists='append', index=False)
Example #19
    def create_job(method, data_in={}, **kwargs):
        """
        Ensures certain jobs are only created once at a time. Never manually add Job(); call this instead.
        """
        with session() as sess:
            arg0 = data_in.get('args', [None])[0]
            if type(arg0) != str: arg0 = None

            # For entries, profiles: set ai_ran=False to queue them into the next batch,
            # then arg0 isn't used downstream (was previously).
            if method in ('entries', 'profiles') and arg0:
                table = dict(entries='entries', profiles='users')[method]
                sess.execute(
                    satext(f"""
                update {table} set ai_ran=False where id=:id;
                """), dict(id=arg0))
                sess.commit()

            exists = sess.execute(
                satext("""
            select 1 from jobs
            -- maybe if we're mid-job, things have changed; so don't incl. working? rethink 
            --where method=:method and state in ('new', 'working') and
            where method=:method and state='new' and
            case
                when method='influencers' then true
                when method='books' and data_in->'args'->>0=:arg0 then true
                when method='entries' then true
                when method='profiles' then true
                when method='habitica' then true
                else false
            end
            """), dict(method=method, arg0=arg0)).fetchone()
            if exists: return False

            j = Job(method=method, data_in=data_in, **kwargs)
            sess.add(j)
            sess.commit()
            sess.refresh(j)
            return str(j.id)
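A hedged usage sketch tying create_job to await_job from Example #2: create_job returns the new job's id as a string, or False if an equivalent job is already queued, and the caller can then poll for completion. uid and logger are assumed to exist in the calling scope; the call shape is reused from Example #12.

# Hypothetical caller.
jid = Job.create_job('books', data_in=dict(args=[str(uid)]))
if jid:
    job = await_job(jid)
    if job.data_out is False:
        logger.error(f"job {job.method} errored or timed out")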
Example #20
def predict_books(user_id, vecs_user, n_recs=30):
    with session() as sess:
        # TODO should I move this down further, to get more lines to test?
        fixt = fixtures.load_books(user_id)
        if fixt is not None: return fixt
        vecs_books, books = load_books(sess, user_id)

    # normalize for cosine, and downstream DNN
    chain = Similars(vecs_user, vecs_books).normalize()
    vecs_user, vecs_books = chain.value()

    logger.info("Finding cosine similarities")

    # Take best cluster-score for every book
    dist = chain.cosine(abs=True).value().min(axis=0)
    # 0f29e591: minmax_scale(dist). norm_out=True works better
    # then map back onto books, so they're back in order (pandas index-matching)

    # Push highly-rated books up, low-rated books down. Do that even stronger for user's own ratings.
    # Using negative-score because cosine DISTANCE (less is better)
    books['dist'] = dist
    books['dist'] = books.dist \
        + (books.dist.std() * -books.global_score / 2.) \
        + (books.dist.std() * -books.user_score)
    assert not books.dist.isna().any(), "Messed up merging shelf/books.dist by index"


    # e2eaea3f: save/load dnn
    dnn = train_books_predictor(books, vecs_books)

    books['dist'] = dnn.predict(vecs_books)

    # dupes by title in libgen
    # r = books.sort_values('dist')\
    df = books[~books.user_rated].sort_values('dist')\
        .drop_duplicates('title', keep='first')\
        .iloc[:n_recs]
    fixtures.save_books(user_id, df)
    return df
Example #21
def await_job(jid):
    with session() as sess:
        params = {'jid': jid}
        i = 0
        while True:
            time.sleep(1)
            job = sess.execute(
                text("""
            select state, method from jobs where id=:jid
            """), params).fetchone()

            # TODO notify them of error?
            # 10 minutes, give up
            if job.state == 'error' or i > 60 * 10:
                return Box(method=job.method, data_out=False)

            if job.state == 'done':
                job = sess.execute(
                    text("""
                select method, data_out from jobs where id=:jid
                """), params).fetchone()
                sess.commit()
                return job
            i += 1
Example #22
        return m[k](*args, **kwargs)

    M.Job.wrap_job(jid_, k, fn)
    # 3eb71b3: unloading models. multiprocessing handles better


if __name__ == '__main__':
    logger.info(f"torch.cuda.current_device() {torch.cuda.current_device()}")
    logger.info(f"torch.cuda.device(0) {torch.cuda.device(0)}")
    logger.info(f"torch.cuda.device_count() {torch.cuda.device_count()}")
    logger.info(
        f"torch.cuda.get_device_name(0) {torch.cuda.get_device_name(0)}")
    logger.info(f"torch.cuda.is_available() {torch.cuda.is_available()}")
    logger.info("\n\n")

    with session() as sess:
        while True:
            M.Machine.notify_online(sess, vars.MACHINE)
            cloud_down_maybe(sess)

            # only allow 2 jobs at a time.
            if M.Machine.job_ct_on_machine(sess, vars.MACHINE) >= 2:
                time.sleep(1)
                continue

            # Find jobs
            job = M.Job.take_job(sess, "run_on='gpu'")
            if job:
                # aaf1ec95: multiprocessing.Process for problem models
                threading.Thread(target=run_job, args=(job, )).start()
                # run_job(job.id)
Example #23
 def eid_to_title(self, eid):
     with session() as sess:
         return sess.execute(
             text("""
         select title from entries where id=:eid
         """), dict(eid=eid)).fetchone().title
Example #24
 def uid_to_email(self, uid):
     with session() as sess:
         return sess.execute(
             text("""
         select email from users where id=:uid
         """), dict(uid=uid)).fetchone().email
Example #25
def load_books_df(sess):
    # sort asc since that's how we mapped to vecs in first place (order_values)
    df = pd.read_sql("select * from books order by id asc", sess.bind)
    if df.shape[0]:
        return df.set_index('id', drop=False)

    logger.info("Load books MySQL")
    FIND_PROBLEMS = False
    ALL_BOOKS = False

    # for-sure psych. See tmp/topics.txt, or libgen.sql topics(lang='en')
    psych_topics = 'psychology|self-help|therapy'
    # good other psych topics, either mis-categorized or other
    psych_topics += '|anthropology|social|religion'
    psych_topics += '|^history|^education'

    sql = Box(
        select="select u.ID, u.Title, u.Author, d.descr, t.topic_descr",
        body="""
            from updated u
                inner join description d on d.md5=u.MD5
                inner join topics t on u.Topic=t.topic_id
                    and t.lang='en'
            where u.Language = 'English'
                and title not regexp 'sams|teach yourself'  -- remove junk
                and (length(d.descr) + length(u.Title)) > 200
            and u.ID not in ('62056','72779','111551','165602','165606','239835','240399','272945','310202','339718','390651','530739','570667','581466','862274','862275','879029','935149','1157279','1204687','1210652','1307307','1410416','1517634','1568907','1592543','2103755','2128089','2130515','2187329','2270690','2270720','2275684','2275804','2277017','2284616','2285559','2314405','2325313','2329959','2340421','2347272','2374055','2397307','2412259','2420958','2421152','2421413','2423975')
            """,

        # handle u.Topic='' (1326526 rows)
        just_psych=f"and t.topic_descr regexp '{psych_topics}'",

        # find_problems
        just_ids="select distinct u.ID",
        where_id="and u.ID=:id"
    )

    if FIND_PROBLEMS:
        # Those MD5s: UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 636: character maps to <undefined>
        # TODO try instead create_engine(convert_unicode=True)

        with session('books') as sessb:
            ids = ' '.join([sql.just_ids, sql.body])
            ids = [x.ID for x in sessb.execute(ids).fetchall()]
            problem_ids = []
            for i, id in enumerate(tqdm(ids)):
                if i % 10000 == 0:
                    problems = len(problem_ids) / len(ids) * 100
                    logger.info(f"{problems}% problems")
                try:
                    row = ' '.join([sql.select, sql.body, sql.where_id])
                    sessb.execute(text(row), {'id': id})
                except:
                    problem_ids.append(id)
        problem_ids = ','.join([f"'{id}'" for id in problem_ids])
        logger.info(f"and u.ID not in ({problem_ids})")
        exit(0)

    sql_ = [sql.select, sql.body]
    if not ALL_BOOKS: sql_ += [sql.just_psych]
    sql_ = ' '.join(sql_)
    with session('books') as sessb:
        df = pd.read_sql(sql_, sessb.bind)
    df = df.drop_duplicates(['Title', 'Author'])

    logger.info(f"n_books before cleanup {df.shape[0]}")
    logger.info("Removing HTML")
    broken = '(\?\?\?|\#\#\#)'  # russian / other FIXME better way to handle
    df = df[~(df.Title + df.descr).str.contains(broken)] \
        .drop_duplicates(['Title', 'Author'])  # TODO reconsider

    df['descr'] = cleantext.multiple(df.descr.tolist(), [
        cleantext.strip_html,
        cleantext.fix_punct,
        cleantext.only_ascii,
        cleantext.multiple_whitespace,
        cleantext.unmark
    ])

    # books = books[books.clean.apply(lambda x: detect(x) == 'en')]
    logger.info(f"n_books after cleanup {df.shape[0]}")

    df = df.rename(columns=dict(
        ID='id',
        descr='text',
        Title='title',
        Author='author',
        topic_descr='topic',
    ))

    # drop dupes, keep longest desc
    df['txt_len'] = df.text.str.len()
    df = df.sort_values('txt_len', ascending=False)\
        .drop_duplicates('id')\
        .drop(columns=['txt_len'])\
        .sort_values('id')
    df['thumbs'] = 0

    logger.info(f"Saving books to DB")
    df.to_sql('books', sess.bind, index=False, chunksize=500, if_exists='append', method='multi')

    return df.set_index('id', drop=False)
Example #26
    create_database(vars.DB_FULL)

import pytest, time
from box import Box
from fastapi.testclient import TestClient
from lorem_text import lorem

import common.database as D
import common.models as M
from common.fixtures import fixtures
from app.main import app

import logging
logger = logging.getLogger(__name__)

with D.session() as sess:
    for t in """
    bookshelf
    cache_entries
    cache_users
    entries
    entries_tags
    field_entries
    fields
    jobs
    machines
    notes
    people
    shares
    shares_tags
    tags
Example #27
def db_books():
    with D.session('books') as sess:
        yield sess
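db (Example #7) and db_books are pytest fixtures (note the yield). A hypothetical test consuming the books session, assuming the libgen updated table that Examples #25 and #28 query:

def test_books_db(db_books):
    # Hypothetical smoke test against the libgen mirror.
    row = db_books.execute("select count(*) as ct from updated").fetchone()
    assert row.ct > 0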
Example #28
    def load_df(self):
        if exists(paths.df):
            logger.info("Load books.df")
            self.df = pd.read_feather(paths.df)\
                .drop(columns=['index'])\
                .set_index('id', drop=False)
            return

        # invalidate embeddings, they're out of sync
        try:
            os.remove(paths.vecs)
        except:
            pass

        logger.info("Load books MySQL")
        # 58fbd36a: limit to psychology topics
        sql = f"""
        select u.ID, u.Title, u.Author, d.descr, t.topic_descr
        from updated u
            inner join description d on d.md5=u.MD5
            inner join topics t on u.Topic=t.topic_id
                -- later more languages; but I think it's only Russian in Libgen?
                and t.lang='en'
        where u.Language = 'English'
            -- Make sure there's some content to work with
            and length(d.descr) > 200 and length(u.Title) > 1
        """
        with session('books') as sessb:
            df = pd.read_sql(sql, sessb.bind)
        df = df.rename(columns=dict(
            ID='id',
            descr='text',
            Title='title',
            Author='author',
            topic_descr='topic',
        ))

        logger.info(f"n_books before cleanup {df.shape[0]}")
        logger.info("Remove HTML")

        # some books are literally just ########
        df = df[~(df.title + df.text).str.contains('(\?\?\?|\#\#\#)')]

        df['text'] = CleanText(df.text.tolist())\
            .strip_html()\
            .only_ascii()\
            .multiple_whitespace()\
            .value()
        df['txt_len'] = df.text.str.len()
        # Ensure has content. Drop dupes, keeping those w longest description
        df = df[df.txt_len > 150]\
            .sort_values('txt_len', ascending=False)\
            .drop_duplicates('id')\
            .drop_duplicates(['title', 'author'])\
            .drop(columns=['txt_len'])
        # books = books[books.clean.apply(lambda x: detect(x) == 'en')]
        logger.info(f"n_books after cleanup {df.shape[0]}")

        logger.info(f"Save books.df")
        # Error: feather does not support serializing a non-default index for the index; you can .reset_index() to make the index into column(s)
        # I get ^ even though no index has yet been set. Have to manually reset_index() anyway
        df = df.reset_index()
        df.to_feather(paths.df)
        # call self, which returns newly-saved df (ensures consistent order, etc)
        self.load_df()
Example #29
def influencers_(user_id):
    logging.info("Influencers")
    with session() as sess:
        fes = pd.read_sql("""
        -- remove duplicates, use average. FIXME find the dupes bug
        with fe_clean as (
            select field_id, created_at::date, avg(value) as value
            from field_entries
            group by field_id, created_at::date
        ),
        -- ensure enough data
        fe_ct as (
          select field_id from fe_clean 
          group by field_id having count(value) > 5
        )
        select  
          fe.created_at, -- index 
          fe.field_id::text, -- column, uuid->string
          fe.value -- value
        from fe_clean fe
        inner join fe_ct on fe_ct.field_id=fe.field_id  -- just removes rows
        inner join fields f on f.id=fe.field_id
        where f.user_id=%(uid)s
          and f.excluded_at is null
        order by fe.created_at asc
        """, sess.bind, params={'uid': user_id})
        if not fes.size: return None  # not enough entries

        params = dict(
            uid=user_id,
            fids=tuple(fes.field_id.unique().tolist())
        )
        fs = pd.read_sql("""
        select id::text, default_value, default_value_value
        from fields
        where user_id=%(uid)s
            and id in %(fids)s
            and excluded_at is null
        """, sess.bind, params=params)

    fs = {r.id: r for i, r in fs.iterrows()}

    # Easier pivot debugging
    # fields['field_id'] =  fields.field_id.apply(lambda x: x[0:4])
    fes = fes.pivot(index='created_at', columns='field_id', values='value')

    # fes = fes.resample('D')
    cols = fes.columns.tolist()

    hypers = hyperopt(fes, fs, user_id)
    xgb_args = {}  # {'tree_method': 'gpu_hist', 'gpu_id': 0}

    next_preds = {}
    importances = {}
    all_imps = []
    for t in cols:
        # remove up until they start tracking; we'll impute from there on up
        fvi = fes[t].first_valid_index()
        fes_ = impute_and_roll(fes[fvi:].copy(), fs)

        ### Next Preds
        ### ----------
        # For next-pred, we keep target column. Yes, likely most predictive; but a rolling
        # trend is important info
        X = fes_
        y = X[t]
        model = XGBRegressor(**xgb_args, **hypers)
        model.fit(X, y)
        preds = model.predict(X.iloc[-1:])
        next_preds[t] = float(preds[0])
        # model.fit(X, y)  # what's this? was I fitting twice?

        ### Importances
        ### -----------
        X = fes_.drop(columns=[t])
        y = fes_[t]
        model = XGBRegressor(**xgb_args, **hypers)
        model.fit(X, y)
        imps = [float(x) for x in model.feature_importances_]

        # FIXME
        # /xgboost/sklearn.py:695: RuntimeWarning: invalid value encountered in true_divide return all_features / all_features.sum()
        # I think this is due to target having no different value, in which case
        # just leave like this.
        imps = [0. if np.isnan(imp) else imp for imp in imps]

        # put target col back in
        imps.insert(cols.index(t), 0.0)
        dict_ = dict(zip(cols, imps))
        all_imps.append(dict_)
        importances[t] = dict_

    all_imps = dict(pd.DataFrame(all_imps).mean())
    return next_preds, importances, all_imps
Example #30
def nlp_on_rows(method='entries'):
    for_entries = method == 'entries'  # else is_profile
    with session() as sess:
        if for_entries:
            rows = sess.query(M.Entry) \
                .filter(
                    func.length(M.Entry.text) > 64,
                    M.Entry.no_ai.isnot(True),
                    M.Entry.ai_ran.isnot(True)
                )
        else:
            rows = sess.query(M.User) \
                .filter(
                    func.length(M.User.bio) > 32,
                    M.User.ai_ran.isnot(True)
                )
        rows = rows.all()
        if not rows: return {}

        paras_grouped = []
        uids = set()
        for r in rows:
            txt = r.text if for_entries \
                else r.bio  # r.profile_to_text()  # TODO profile_to_text adds people
            paras_grouped.append(cleantext.markdown_split_paragraphs([txt]))
            if for_entries:
                uids.add(r.user_id)
        paras_flat = [p for paras in paras_grouped for p in paras]

        fkeys = [r.title for r in rows] \
            if for_entries else [r.email for r in rows]
        fixt = fixtures.load_nlp_rows(fkeys, method=method)
        if fixt:
            if for_entries:
                clean_txt, embeds, titles, texts = fixt
            else:
                clean_txt, embeds = fixt
        else:
            clean_txt = cleantext.keywords(
                paras_flat, postags=['NOUN', 'ADJ', 'VERB', 'PROPN'])
            embeds = nlp_.sentence_encode(paras_flat).tolist()
            if for_entries:
                titles = nlp_.summarization(paras_grouped,
                                            min_length=5,
                                            max_length=20,
                                            with_sentiment=False)
                texts = nlp_.summarization(paras_grouped,
                                           min_length=30,
                                           max_length=250)

        for i, r in enumerate(rows):
            CM = M.CacheEntry if for_entries else M.CacheUser
            c = sess.query(CM).get(r.id)
            if not c:
                c = CM(entry_id=r.id) if for_entries else CM(user_id=r.id)
                sess.add(c)
            # Save the cache_entry (paras,clean,vectors)
            paras = paras_grouped[i]
            c.paras = paras
            ct = len(paras)
            c.clean = [' '.join(e) for e in clean_txt[:ct]]
            c.vectors = embeds[:ct]
            sess.commit()

            # Save the fixture for later
            fixt = (clean_txt[:ct], embeds[:ct], titles[i], texts[i]) \
                if for_entries else (clean_txt[:ct], embeds[:ct])
            fixt_k = r.title if for_entries else r.email
            fixtures.save_nlp_row(fixt_k, fixt, method=method)

            clean_txt, embeds = clean_txt[ct:], embeds[ct:]

            if for_entries:
                r.title_summary = titles[i]["summary"]
                r.text_summary = texts[i]["summary"]
                r.sentiment = texts[i]["sentiment"]
            r.ai_ran = True
            sess.commit()

        if for_entries:
            # 9131155e: only update every x entries
            M.Job.multiple_book_jobs(list(uids))
    return {}