def test_find_truffles():
    """Train a small pipeline on random posts and rank fresh ones.

    The first row returned by ``tpmo.find_truffles`` has to carry the
    highest ``rank_score`` of the whole result.
    """
    regressor_kwargs = {
        'n_estimators': 20,
        'max_leaf_nodes': 100,
        'max_features': 0.1,
        'n_jobs': -1,
        'verbose': 1,
        'random_state': 42,
    }
    topic_kwargs = {'num_topics': 50, 'no_below': 5, 'no_above': 0.7}

    # Training set: 300 random posts.
    train_frame = pd.DataFrame(create_n_random_posts(300))
    train_frame = tppp.preprocess(train_frame, ncores=4, chunksize=50)
    pipeline = tpmo.train_pipeline(train_frame,
                                   topic_kwargs=topic_kwargs,
                                   regressor_kwargs=regressor_kwargs)

    # Candidate set: 50 further random posts that get ranked.
    candidate_frame = pd.DataFrame(create_n_random_posts(50))
    candidate_frame = tppp.preprocess(candidate_frame, ncores=4,
                                      chunksize=50)
    truffles = tpmo.find_truffles(candidate_frame, pipeline, account='aa')

    best_score = truffles.rank_score.max()
    assert truffles.iloc[0].rank_score == best_score
def main():
    """Main loop started from command line.

    Scrapes the most recent posts, preprocesses them, looks up payments
    to bid bots within the time window of the scraped posts, corrects
    votes and rewards accordingly and hands the result to
    ``tt0b.create_trending_post``.

    The permalinks of the daily and weekly posts are hard coded in this
    script.
    """
    no_broadcast, current_datetime = parse_args()

    if current_datetime is None:
        # ``pd.datetime`` is deprecated since pandas 1.0 and no longer
        # available in pandas 2; build a naive UTC timestamp instead, the
        # same type ``pd.to_datetime`` yields in the other branch.
        current_datetime = pd.Timestamp.now(tz='UTC').tz_localize(None)
    else:
        current_datetime = pd.to_datetime(current_datetime)

    log_directory = os.path.join(config.PROJECT_DIRECTORY, 'logs')
    configure_logging(log_directory, current_datetime)

    logger.info('STARTING main script at {}'.format(current_datetime))
    if no_broadcast:
        logger.info('Run without broadcasting.')
    else:
        logger.info('ATTENTION I WILL BROADCAST TO STEEMIT!!!')
    # Short pause before the nodes are contacted.
    time.sleep(2)

    steem = MPSteem(nodes=config.NODES, no_broadcast=no_broadcast)
    # To post stuff
    account = config.ACCOUNT
    poster = Poster(account=account, steem=steem)

    prediction_frame = tpgd.scrape_hour_data(steem=steem,
                                             current_datetime=current_datetime,
                                             ncores=32,
                                             offset_hours=2)
    prediction_frame = tppp.preprocess(prediction_frame, ncores=8)

    permalink = 'daily-truffle-picks-2018-03-27'
    overview_permalink = 'weekly-truffle-updates-2018-12'

    logger.info('Computing the top trending without bidbots')
    logger.info('Searching for bid bots and bought votes')
    # Payments are searched from the oldest post up to one day after the
    # newest one.
    min_datetime = prediction_frame.created.min()
    max_datetime = prediction_frame.created.max() + pd.Timedelta(days=1)
    upvote_payments, bots = tpad.get_upvote_payments_to_bots(
        steem=steem,
        min_datetime=min_datetime,
        max_datetime=max_datetime)

    logger.info('Adjusting votes and reward')
    sorted_frame = tppp.compute_bidbot_correction(
        post_frame=prediction_frame,
        upvote_payments=upvote_payments)

    tt0b.create_trending_post(sorted_frame,
                              upvote_payments=upvote_payments,
                              poster=poster,
                              topN_permalink=permalink,
                              overview_permalink=overview_permalink,
                              current_datetime=current_datetime,
                              bots=bots)

    logger.info('DONE at {}'.format(current_datetime))
def test_test_all_top_with_real_data(steem):
    """Run the complete top list posting chain on scraped data.

    Scrapes a handful of posts, uses the real reward and votes as
    stand-ins for the predictions, and then posts the top list, comments
    on it and votes/comments on the single best post.
    """
    if config.PASSWORD:
        steem.wallet.unlock(config.PASSWORD)

    poster = Poster(steem=steem,
                    account=config.ACCOUNT,
                    waiting_time=0.1,
                    no_posting_key_mode=config.PASSWORD is None)

    df = tpbg.scrape_hour_data(steem, stop_after=10)
    df = tppp.preprocess(df)

    # No model involved here, the true values act as predictions.
    df['predicted_reward'] = df.reward
    df['predicted_votes'] = df.votes

    # ``pd.datetime`` is deprecated since pandas 1.0 and no longer
    # available in pandas 2; take the current UTC date from a Timestamp.
    date = pd.Timestamp.now(tz='UTC').date()

    permalink = tbpd.post_topN_list(df, poster, date,
                                    overview_permalink='jjj')
    tbpd.comment_on_own_top_list(df, poster, permalink)
    tbpd.vote_and_comment_on_topK(df, poster, permalink, K=1,
                                  overview_permalink='jjj')
def test_test_top_trending_post(steem):
    """Post a top trending list built from ten random posts."""
    if config.PASSWORD:
        steem.wallet.unlock(config.PASSWORD)

    poster = Poster(steem=steem,
                    account=config.ACCOUNT,
                    waiting_time=0.1,
                    no_posting_key_mode=config.PASSWORD is None)

    posts = random_data.create_n_random_posts(10)
    df = pd.DataFrame(posts)

    # NOTE(review): this assigns the column to itself and is therefore a
    # no-op; sibling tests set 'predicted_reward' here -- confirm intent.
    df['reward'] = df.reward
    df['predicted_votes'] = df.votes

    df = tppp.preprocess(df, ncores=1)

    # ``pd.datetime`` is deprecated since pandas 1.0 and no longer
    # available in pandas 2; take the current UTC date from a Timestamp.
    date = pd.Timestamp.now(tz='UTC').date()

    tbpd.post_top_trending_list(df, poster, date,
                                overview_permalink='iii',
                                trufflepicks_permalink='kkk',
                                steem_amount=10,
                                sbd_amount=10)
def test_create_trending_post(noapisteem):
    """Create a trending post from a small sample of scraped posts.

    Scrapes at most 20 posts, looks up payments to the bot ``booster``,
    applies the bid bot correction and passes everything on to
    ``tt0b.create_trending_post``.
    """
    # ``pd.datetime`` is deprecated since pandas 1.0 and no longer
    # available in pandas 2; use a naive UTC timestamp instead.
    current_datetime = pd.Timestamp.now(tz='UTC').tz_localize(None)

    data_frame = tpgd.scrape_hour_data(steem=noapisteem,
                                       current_datetime=current_datetime,
                                       ncores=32,
                                       offset_hours=8,
                                       hours=1,
                                       stop_after=20)

    # Payment window: from the oldest post to eight days past the newest.
    min_datetime = data_frame.created.min()
    max_datetime = data_frame.created.max() + pd.Timedelta(days=8)
    upvote_payments, bots = tpad.get_upvote_payments_to_bots(
        steem=noapisteem,
        min_datetime=min_datetime,
        max_datetime=max_datetime,
        bots=['booster'])

    data_frame = tppp.preprocess(data_frame, ncores=1)
    data_frame = tppp.compute_bidbot_correction(
        post_frame=data_frame,
        upvote_payments=upvote_payments)

    account = config.ACCOUNT
    poster = Poster(account=account,
                    steem=noapisteem,
                    no_posting_key_mode=config.PASSWORD is None)

    tt0b.create_trending_post(data_frame, upvote_payments, poster,
                              'test', 'test', current_datetime,
                              bots=bots)
def test_weekly_post(steem):
    """Train on random posts and publish the weekly update.

    The call to ``tppw.post_weakly_update`` must return a truthy
    permalink.
    """
    posts = create_n_random_posts(300)
    post_frame = pd.DataFrame(posts)

    # ``pd.datetime`` is deprecated since pandas 1.0 and no longer
    # available in pandas 2; use a naive UTC timestamp instead.
    current_date = pd.Timestamp.now(tz='UTC').tz_localize(None)

    regressor_kwargs = dict(n_estimators=20, max_leaf_nodes=100,
                            max_features=0.1, n_jobs=-1,
                            verbose=1, random_state=42)
    topic_kwargs = dict(num_topics=50, no_below=5, no_above=0.7)

    post_frame = tppp.preprocess(post_frame, ncores=4, chunksize=50)
    pipeline = tpmo.train_pipeline(post_frame,
                                   topic_kwargs=topic_kwargs,
                                   regressor_kwargs=regressor_kwargs)

    # Random posts carry no bid bot information, so fill in zeros.
    post_frame['steem_bought_reward'] = 0
    post_frame['sbd_bought_reward'] = 0
    post_frame['bought_votes'] = 0

    poster = Poster(account=config.ACCOUNT,
                    steem=steem,
                    no_posting_key_mode=config.PASSWORD is None)

    permalink = tppw.post_weakly_update(pipeline, post_frame,
                                        poster=poster,
                                        current_datetime=current_date)
    assert permalink
def test_crossval():
    """Smoke test: cross validation over a tiny grid runs through."""
    regressor_kwargs = {
        'n_estimators': 20,
        'max_leaf_nodes': 100,
        'max_features': 0.1,
        'n_jobs': -1,
        'verbose': 1,
        'random_state': 42,
    }
    topic_kwargs = {'num_topics': 50, 'no_below': 5, 'no_above': 0.7}

    # Two values per searched parameter keep the grid small.
    param_grid = {
        'feature_generation__topic_model__no_above': [0.2, 0.3],
        'regressor__max_leaf_nodes': [50, 100],
    }

    frame = pd.DataFrame(create_n_random_posts(100))
    frame = tppp.preprocess(frame, ncores=4, chunksize=20)

    tpmo.cross_validate(frame, param_grid,
                        topic_kwargs=topic_kwargs,
                        regressor_kwargs=regressor_kwargs)
def test_statistics():
    """Compute weekly statistics and render the weekly update text.

    Both the title and the body returned by ``tpbp.weekly_update`` must
    be non-empty.
    """
    posts = create_n_random_posts(300)
    post_frame = pd.DataFrame(posts)

    # ``pd.datetime`` is deprecated since pandas 1.0 and no longer
    # available in pandas 2; use a naive UTC timestamp instead.
    current_date = pd.Timestamp.now(tz='UTC').tz_localize(None)

    regressor_kwargs = dict(n_estimators=20, max_leaf_nodes=100,
                            max_features=0.1, n_jobs=-1,
                            verbose=1, random_state=42)
    topic_kwargs = dict(num_topics=50, no_below=5, no_above=0.7)

    post_frame = tppp.preprocess(post_frame, ncores=4, chunksize=50)
    pipeline = tpmo.train_pipeline(post_frame,
                                   topic_kwargs=topic_kwargs,
                                   regressor_kwargs=regressor_kwargs)

    # Random posts carry no bid bot information, so fill in zeros.
    post_frame['steem_bought_reward'] = 0
    post_frame['sbd_bought_reward'] = 0
    post_frame['bought_votes'] = 0

    stats = tppw.compute_weekly_statistics(post_frame, pipeline)

    steem_per_mvests = 490
    delegator_list = ['peter', 'paul']

    title, body = tpbp.weekly_update(steem_per_mvests=steem_per_mvests,
                                     current_datetime=current_date,
                                     delegator_list=delegator_list,
                                     **stats)
    assert title
    assert body
def test_find_truffles_with_real_data(steem):
    """Rank scraped posts with a mock pipeline.

    The top row of the result has to hold the maximum ``rank_score``.
    """
    scraped = tpbg.scrape_hour_data(steem, stop_after=20)
    preprocessed = tppp.preprocess(scraped)

    ranked = tpmo.find_truffles(preprocessed, MockPipeline())

    scores = ranked.rank_score
    assert scores.iloc[0] == scores.max()
def test_preprocessing():
    """Preprocessing the ``POSTS`` sample must leave at least one post."""
    filter_kwargs = {
        'min_en_prob': 0.5,
        'max_errors_per_word': 0.5,
        'min_max_num_words': (10, 99999),
    }
    raw_frame = pd.DataFrame(POSTS)

    filtered = tppp.preprocess(raw_frame, ncores=1, **filter_kwargs)

    assert len(filtered) > 0
def large_mp_preprocess(log_directory, current_datetime, steem,
                        data_directory, days, offset_days):
    """Helper function to spawn in child process.

    Configures logging, loads or scrapes the training data for the
    requested ``days`` / ``offset_days`` window and returns the
    preprocessed frame.
    """
    configure_logging(log_directory, current_datetime)

    scrape_kwargs = {
        'current_datetime': current_datetime,
        'days': days,
        'offset_days': offset_days,
        'ncores': 32,
    }
    raw_frame = tpgd.load_or_scrape_training_data(steem, data_directory,
                                                  **scrape_kwargs)

    preprocessed = tppp.preprocess(raw_frame, ncores=8)
    return preprocessed
def test_store_load_frame_test(temp_dir):
    """A preprocessed frame survives a round trip through SQLite."""
    table = 'test'
    db_path = os.path.join(temp_dir, 'test.sqlite')

    stored = tppp.preprocess(pd.DataFrame(create_n_random_posts(42)))

    tppe.to_sqlite(stored, db_path, table)
    loaded = tppe.from_sqlite(db_path, table)

    pd.testing.assert_frame_equal(stored, loaded)
def test_preprocessing_random_parallel():
    """Parallel preprocessing of 50 random posts keeps more than 20."""
    filter_kwargs = {
        'min_en_prob': 0.5,
        'max_errors_per_word': 0.5,
        'min_max_num_words': (10, 99999),
    }
    raw_frame = pd.DataFrame(create_n_random_posts(50))

    filtered = tppp.preprocess(raw_frame, ncores=5, chunksize=10,
                               **filter_kwargs)

    assert len(filtered) > 20
def test_preprocessing_parallel():
    """Parallel preprocessing of 100 copies of one post keeps more than 40.

    The copies receive 50 distinct permalinks, each used twice.
    """
    n_posts = 100
    raw_frame = pd.DataFrame([POSTS[0]] * n_posts)
    raw_frame['permalink'] = ['kkk{}'.format(idx % 50)
                              for idx in range(n_posts)]

    filter_kwargs = {
        'min_en_prob': 0.5,
        'max_errors_per_word': 0.5,
        'min_max_num_words': (10, 99999),
    }
    filtered = tppp.preprocess(raw_frame, ncores=5, chunksize=20,
                               **filter_kwargs)

    assert len(filtered) > 40
def test_tag_measure():
    """All tag factors computed for random posts are strictly positive."""
    frame = pd.DataFrame(create_n_random_posts(100))
    frame = tppp.preprocess(frame, ncores=1)
    frame['predicted_reward'] = frame.reward

    factors = tpmo.compute_tag_factor(frame.tags, tpmo.PUNISH_LIST)

    assert np.all(factors > 0)
def test_topN_comment():
    """``tbpo.topN_comment`` returns a non-empty comment for 25 posts."""
    frame = pd.DataFrame(random_data.create_n_random_posts(25))
    frame = tppp.preprocess(frame, ncores=1)

    comment_kwargs = {
        'topN_authors': frame.author,
        'topN_permalinks': frame.permalink,
        'topN_titles': frame.title,
        'topN_votes': frame.votes,
        'topN_rewards': frame.reward,
    }
    comment = tbpo.topN_comment(**comment_kwargs)

    assert comment
def test_Doc2Vec_KNN():
    """Smoke test: the pure doc2vec pipeline trains on random posts."""
    doc2vec_kwargs = {'epochs': 2, 'size': 16}

    frame = pd.DataFrame(create_n_random_posts(100))
    frame = tppp.preprocess(frame, ncores=4, chunksize=30)

    pipe = tpmo.create_pure_doc2vec_pipeline(doc2vec_kwargs)
    # Unpacking checks that exactly two values come back.
    pipe, frame = tpmo.train_test_pipeline(frame,
                                           pipeline=pipe,
                                           sample_weight_function=None)
def test_filtered_body_no_images_regression(steem):
    """Regression test for image names leaking into the filtered body.

    Covers the filtering error visible in the quotes of
    https://steemit.com/steemit/@trufflepig/daily-truffle-picks-2018-03-31
    """
    authors_and_permalinks = [
        ('colovhis', 'dofus-mastodon-cemetery-basic-tutorial'),
        ('joshuaetim',
         'rewarding-hardwork-and-excellence-amongst-school-children-'
         'through-steem-powered-notebooks-and-writing-materials-3rd-phase-of'),
    ]
    posts = tpgd.get_post_data(authors_and_permalinks, steem=steem)

    frame = tppp.preprocess(pd.DataFrame(posts), ncores=1)

    # No filtered body may still contain an upper case JPG file ending.
    contains_jpg = ['.JPG' in body for body in frame.filtered_body]
    assert not any(contains_jpg)
def test_train_test_pipeline():
    """Smoke test: ``tpmo.train_test_pipeline`` runs on random posts."""
    regressor_kwargs = {
        'n_estimators': 20,
        'max_leaf_nodes': 100,
        'max_features': 0.1,
        'n_jobs': -1,
        'verbose': 1,
        'random_state': 42,
    }
    topic_kwargs = {'num_topics': 50, 'no_below': 5, 'no_above': 0.7}

    frame = pd.DataFrame(create_n_random_posts(300))
    frame = tppp.preprocess(frame, ncores=4, chunksize=50)

    tpmo.train_test_pipeline(frame,
                             topic_kwargs=topic_kwargs,
                             regressor_kwargs=regressor_kwargs)
def execute_call(comment_authors_and_permalinks, poster, pipeline,
                 topN_permalink, max_comments, overview_permalink):
    """Executes the pig on duty call.

    Fetches the parent posts of the given comments, preprocesses them and
    evaluates the remaining ones with ``pipeline``. Posts that survive
    preprocessing are flagged ``passed=True``, all others
    ``passed=False``; the combined frame is handed to
    ``tpoc.post_on_call``.

    Parameters
    ----------
    comment_authors_and_permalinks: sliceable sequence identifying the
        comments that mention the account; cut to ``max_comments`` entries
    poster: object providing ``account`` and ``steem``
    pipeline: model passed on to ``tpmo.find_truffles``
    topN_permalink: permalink used to build the link to the top list
    max_comments: upper limit of comments handled in one call
    overview_permalink: passed through to ``tpoc.post_on_call``
    """
    ncomments = len(comment_authors_and_permalinks)
    logger.info('Found {} comments mentioning {}'.format(ncomments,
                                                         poster.account))

    if ncomments > max_comments:
        logger.info('To many comments, reducing to {}'.format(max_comments))
        comment_authors_and_permalinks = \
            comment_authors_and_permalinks[:max_comments]

    posts = tpco.get_parent_posts(comment_authors_and_permalinks,
                                  poster.steem)

    initial_frame = pd.DataFrame(posts)
    # Preprocess a copy so the unfiltered posts stay available below.
    post_frame = initial_frame.copy()
    post_frame = tppp.preprocess(post_frame, ncores=4)

    if len(post_frame):
        truffle_frame = tpmo.find_truffles(post_frame, pipeline,
                                           k=0, account='',
                                           add_rank_score=False)
        truffle_frame['passed'] = True
    else:
        truffle_frame = pd.DataFrame()

    # Everything missing from the truffle frame was filtered out. The
    # explicit copy avoids assigning into a slice of ``initial_frame``
    # (chained assignment / SettingWithCopyWarning).
    was_filtered = ~initial_frame.index.isin(truffle_frame.index)
    filtered_posts = initial_frame[was_filtered].copy()
    filtered_posts['passed'] = False

    combined = pd.concat([truffle_frame, filtered_posts], axis=0)

    topN_link = 'https://steemit.com/@{author}/{permalink}'.format(
        author=poster.account, permalink=topN_permalink)

    tpoc.post_on_call(combined,
                      poster=poster,
                      topN_link=topN_link,
                      overview_permalink=overview_permalink)
def test_topN_post():
    """``tbpo.topN_post`` returns a non-empty title and post body."""
    posts = random_data.create_n_random_posts(10)
    df = pd.DataFrame(posts)
    df = tppp.preprocess(df, ncores=1)

    # ``pd.datetime`` is deprecated since pandas 1.0 and no longer
    # available in pandas 2; take the current UTC date from a Timestamp.
    date = pd.Timestamp.now(tz='UTC').date()

    # Item assignment works whether or not the column already exists;
    # attribute assignment cannot create a new column.
    df['image_urls'] = df.body.apply(tptf.get_image_urls)

    title, post = tbpo.topN_post(topN_authors=df.author,
                                 topN_permalinks=df.permalink,
                                 topN_titles=df.title,
                                 topN_filtered_bodies=df.filtered_body,
                                 topN_image_urls=df.image_urls,
                                 topN_rewards=df.reward,
                                 topN_votes=df.votes,
                                 title_date=date,
                                 truffle_link='de.de.de')
    assert post
    assert title
def test_test_top20_vote_and_comment(steem):
    """Vote and comment on the top posts of a random sample.

    The real reward and votes stand in for the predicted values.
    """
    password = config.PASSWORD
    if password:
        steem.wallet.unlock(password)

    poster = Poster(steem=steem,
                    account=config.ACCOUNT,
                    waiting_time=0.1,
                    no_posting_key_mode=config.PASSWORD is None)

    frame = pd.DataFrame(random_data.create_n_random_posts(10))
    frame['predicted_reward'] = frame.reward
    frame['predicted_votes'] = frame.votes
    frame = tppp.preprocess(frame, ncores=1)

    tbpd.vote_and_comment_on_topK(frame, poster, 'laida',
                                  overview_permalink='lll')
def test_test_top10post(steem):
    """Post the top list for ten random posts and comment on it.

    The real reward and votes stand in for the predicted values.
    """
    if config.PASSWORD:
        steem.wallet.unlock(config.PASSWORD)

    poster = Poster(steem=steem,
                    account=config.ACCOUNT,
                    waiting_time=0.1,
                    no_posting_key_mode=config.PASSWORD is None)

    posts = random_data.create_n_random_posts(10)
    df = pd.DataFrame(posts)
    df['predicted_reward'] = df.reward
    df['predicted_votes'] = df.votes

    df = tppp.preprocess(df, ncores=1)

    # ``pd.datetime`` is deprecated since pandas 1.0 and no longer
    # available in pandas 2; take the current UTC date from a Timestamp.
    date = pd.Timestamp.now(tz='UTC').date()

    permalink = tbpd.post_topN_list(df, poster, date,
                                    overview_permalink='iii')
    tbpd.comment_on_own_top_list(df, poster, permalink)
def test_load_or_train(temp_dir):
    """``tpmo.load_or_train_pipeline`` stores one model and reuses it.

    After the first call exactly one file exists in ``temp_dir``. A
    second call with the same datetime must not add another file and has
    to return a pipeline with the same step names.
    """
    # ``pd.datetime`` is deprecated since pandas 1.0 and no longer
    # available in pandas 2; use a naive UTC timestamp instead.
    cdt = pd.Timestamp.now(tz='UTC').tz_localize(None)

    posts = create_n_random_posts(300)
    post_frame = pd.DataFrame(posts)

    regressor_kwargs = dict(n_estimators=20, max_leaf_nodes=100,
                            max_features=0.1, n_jobs=-1,
                            verbose=1, random_state=42)
    topic_kwargs = dict(num_topics=50, no_below=5, no_above=0.7)

    post_frame = tppp.preprocess(post_frame, ncores=4, chunksize=50)

    pipe = tpmo.load_or_train_pipeline(post_frame, temp_dir,
                                       current_datetime=cdt,
                                       topic_kwargs=topic_kwargs,
                                       regressor_kwargs=regressor_kwargs)

    # Second entry of the feature union is taken as the topic model.
    topic_model = pipe.named_steps['feature_generation'].transformer_list[1][1]
    result = topic_model.print_topics()
    assert result

    assert len(os.listdir(temp_dir)) == 1

    pipe2 = tpmo.load_or_train_pipeline(post_frame, temp_dir,
                                        current_datetime=cdt,
                                        topic_kwargs=topic_kwargs,
                                        regressor_kwargs=regressor_kwargs)

    assert len(os.listdir(temp_dir)) == 1
    assert set(pipe.named_steps.keys()) == set(pipe2.named_steps.keys())
def main():
    """Evaluate a single post given on the command line.

    Loads (or trains) the pipeline for the requested datetime, fetches
    the post identified by author and permalink without broadcasting,
    preprocesses it and runs ``tpmo.find_truffles`` on it.
    """
    logging.basicConfig(level=logging.INFO)

    author, permalink, current_datetime = parse_args()

    if current_datetime is None:
        # ``pd.datetime`` is deprecated since pandas 1.0 and no longer
        # available in pandas 2; build a naive UTC timestamp instead, the
        # same type ``pd.to_datetime`` yields in the other branch.
        current_datetime = pd.Timestamp.now(tz='UTC').tz_localize(None)
    else:
        current_datetime = pd.to_datetime(current_datetime)

    model_directory = os.path.join(config.PROJECT_DIRECTORY,
                                   'trained_models')

    # No training frame is passed (``None``) to load_or_train_pipeline.
    pipeline = tpmo.load_or_train_pipeline(None, model_directory,
                                           current_datetime)

    steem = MPSteem(nodes=config.NODES, no_broadcast=True)
    posts = tpgd.get_post_data([(author, permalink)], steem, {})

    posts = pd.DataFrame(posts)
    post_frame = tppp.preprocess(posts)

    tpmo.find_truffles(post_frame, pipeline)
def main():
    """Main loop started from command line.

    Performs the daily duty in this order: pay the delegates, load or
    train the model, post the weekly overview if none exists yet, scrape
    and rank the latest posts, publish the top list and vote/comment on
    it, publish the bid bot corrected trending post, answer manual calls,
    clean up old files and finally preload training data for a later run.
    """
    no_broadcast, current_datetime = parse_args()

    if current_datetime is None:
        # ``pd.datetime`` is deprecated since pandas 1.0 and no longer
        # available in pandas 2; build a naive UTC timestamp instead, the
        # same type ``pd.to_datetime`` yields in the other branch.
        current_datetime = pd.Timestamp.now(tz='UTC').tz_localize(None)
    else:
        current_datetime = pd.to_datetime(current_datetime)

    data_directory = os.path.join(config.PROJECT_DIRECTORY, 'scraped_data')
    model_directory = os.path.join(config.PROJECT_DIRECTORY,
                                   'trained_models')
    log_directory = os.path.join(config.PROJECT_DIRECTORY, 'logs')

    configure_logging(log_directory, current_datetime)

    logger.info('STARTING main script at {}'.format(current_datetime))
    if no_broadcast:
        logger.info('Run without broadcasting.')
    else:
        logger.info('ATTENTION I WILL BROADCAST TO STEEMIT!!!')
    # Short pause before the nodes are contacted.
    time.sleep(2)

    steem = MPSteem(nodes=config.NODES, no_broadcast=no_broadcast)
    # hack to allow for payments, because of
    # https://github.com/steemit/steem-python/issues/191
    noapisteem = MPSteem(nodes=config.NODES[1:], no_broadcast=no_broadcast)
    # To post stuff
    account = config.ACCOUNT
    poster = Poster(account=account, steem=noapisteem)

    tppd.create_wallet(steem, config.PASSWORD,
                       posting_key=config.POSTING_KEY,
                       active_key=config.ACTIVE_KEY)

    logger.info('Paying out investors')
    tpde.pay_delegates(
        account=account,
        steem=noapisteem,  # use a steem instance without api.steem!
        current_datetime=current_datetime)

    # Training data is only loaded if no model exists for this datetime.
    if not tpmo.model_exists(current_datetime, model_directory):
        post_frame = load_and_preprocess_2_frames(
            log_directory=log_directory,
            current_datetime=current_datetime,
            steem=steem,
            noapisteem=noapisteem,
            data_directory=data_directory)
        logger.info('Garbage collecting')
        gc.collect()
    else:
        post_frame = None

    regressor_kwargs = dict(n_estimators=256, max_leaf_nodes=5000,
                            max_features=0.2, n_jobs=-1,
                            verbose=1, random_state=42)

    topic_kwargs = dict(num_topics=128, no_below=7, no_above=0.1,
                        ngrams=(1, 2), keep_n=333000)

    if post_frame is not None and len(post_frame) > MAX_DOCUMENTS:
        logger.info('Frame has {} Documents, too many, '
                    'reducing to {}'.format(len(post_frame), MAX_DOCUMENTS))
        # Keep the most recently created posts for training.
        post_frame.sort_values('created', inplace=True, ascending=False)
        train_frame = post_frame.iloc[:MAX_DOCUMENTS, :]
    else:
        train_frame = post_frame

    pipeline = tpmo.load_or_train_pipeline(
        train_frame, model_directory,
        current_datetime,
        regressor_kwargs=regressor_kwargs,
        topic_kwargs=topic_kwargs,
        targets=['adjusted_reward', 'adjusted_votes'])

    tpmo.log_pipeline_info(pipeline=pipeline)

    overview_permalink = tppw.return_overview_permalink_if_exists(
        account=account,
        current_datetime=current_datetime,
        steem=steem)

    if not overview_permalink:
        if post_frame is None:
            # The model was loaded from disk, so no data is in memory yet.
            logger.info('Need to reaload data for weekly overview')
            post_frame = load_and_preprocess_2_frames(
                log_directory=log_directory,
                current_datetime=current_datetime,
                steem=steem,
                noapisteem=noapisteem,
                data_directory=data_directory)

        logger.info('I want to post my weekly overview')
        overview_permalink = tppw.post_weakly_update(
            pipeline=pipeline,
            post_frame=post_frame,
            poster=poster,
            current_datetime=current_datetime)

    # Drop the reference to the training data before scraping new posts.
    logger.info('Garbage collecting')
    del post_frame
    gc.collect()

    prediction_frame = tpgd.scrape_hour_data(steem=steem,
                                             current_datetime=current_datetime,
                                             ncores=32,
                                             offset_hours=2)
    prediction_frame = tppp.preprocess(prediction_frame, ncores=8)

    sorted_frame = tpmo.find_truffles(prediction_frame, pipeline,
                                      account=account)

    permalink = tppd.post_topN_list(sorted_frame, poster=poster,
                                    current_datetime=current_datetime,
                                    overview_permalink=overview_permalink)

    tppd.comment_on_own_top_list(sorted_frame, poster=poster,
                                 topN_permalink=permalink)

    tppd.vote_and_comment_on_topK(sorted_frame, poster=poster,
                                  topN_permalink=permalink,
                                  overview_permalink=overview_permalink)

    logger.info('Computing the top trending without bidbots')
    logger.info('Searching for bid bots and bought votes')
    # Payments are searched from the oldest post up to one day after the
    # newest one.
    min_datetime = sorted_frame.created.min()
    max_datetime = sorted_frame.created.max() + pd.Timedelta(days=1)
    upvote_payments, bots = tpad.get_upvote_payments_to_bots(
        steem=noapisteem,
        min_datetime=min_datetime,
        max_datetime=max_datetime)

    logger.info('Adjusting votes and reward')
    sorted_frame = tppp.compute_bidbot_correction(
        post_frame=sorted_frame,
        upvote_payments=upvote_payments)

    tt0b.create_trending_post(sorted_frame,
                              upvote_payments=upvote_payments,
                              poster=poster,
                              topN_permalink=permalink,
                              overview_permalink=overview_permalink,
                              current_datetime=current_datetime,
                              bots=bots)

    logger.info('Done with normal duty, answering manual calls!')
    tfod.call_a_pig(poster=poster,
                    pipeline=pipeline,
                    topN_permalink=permalink,
                    current_datetime=current_datetime,
                    offset_hours=2,
                    hours=24,
                    overview_permalink=overview_permalink)

    logger.info('Cleaning up after myself')
    tfut.clean_up_directory(model_directory, keep_last=3)
    tfut.clean_up_directory(data_directory, keep_last=25)
    tfut.clean_up_directory(log_directory, keep_last=14)

    logger.info('Preloading -8 days for later training')
    tpgd.load_or_scrape_training_data(steem, data_directory,
                                      current_datetime=current_datetime,
                                      days=1,
                                      offset_days=8,
                                      ncores=32)

    logger.info('DONE at {}'.format(current_datetime))