def write_posts_to_db(tracks_df, _p_):
    """Upsert every record of ``tracks_df`` into an auxsauce collection.

    Each record is stored under
    ``_id = '<user_permalink>/<track_permalink>'``.  The record is added to
    the document's ``updates`` set, copied into ``current``, and
    ``dt_last_update`` is set to ``int(time())``.

    Args:
        tracks_df: Object exposing ``to_dict(orient='records')`` (e.g. a
            pandas DataFrame) whose records carry ``user_permalink`` and
            ``track_permalink``.
        _p_: Run parameters; only ``_p_['tracks_or_reposts']`` is read.

    Only the ``'tracks'`` mode is implemented.  Any other value prints a
    notice and terminates the process through ``exit()``.
    """
    # add entries
    tracks_list = tracks_df.to_dict(orient='records')

    if _p_['tracks_or_reposts'] == 'tracks':
        # Resolve the collection once rather than logging in per track.
        coll = login_mongo('auxsauce')[_p_['tracks_or_reposts']]
        for track in tracks_list:
            t_p = '{0}/{1}'.format(track['user_permalink'],
                                   track['track_permalink'])
            coll.update_one(
                {'_id': t_p},
                update={
                    '$addToSet': {'updates': track},
                    '$set': {'dt_last_update': int(time()), 'current': track},
                },
                upsert=True)
    else:
        print('havent gotten here yet')
        exit()
def add_to_drop_list_and_remove(perma):
    """Record ``perma`` in ``dropped_profiles`` and delete it from ``all_profiles``.

    Args:
        perma: Profile permalink used as the match key in both collections.
    """
    # Step 1: make sure the permalink is on the drop list (created if absent).
    dropped = login_mongo("auxsauce")['dropped_profiles']
    dropped.update_one(
        filter={'permalink': perma},
        update={'$set': {'permalink': perma}},
        upsert=True)

    # Step 2: remove the matching document from the full profile list.
    everyone = login_mongo("auxsauce")['all_profiles']
    everyone.delete_one({'permalink': perma})
def delete_column_from_collection(db_name, coll_name, drop_col_list):
    """Interactively confirm, then ``$unset`` one or more fields in a collection.

    Args:
        db_name: Database name handed to ``login_mongo``.
        coll_name: Collection whose documents are modified.
        drop_col_list: A single field name, or an iterable of field names.
            (Previously a list crashed with ``TypeError`` because it was
            used directly as a dict key.)

    The prompt repeats until the operator types ``yes`` (any case); the only
    way to decline is to interrupt the process.
    """
    prompt = (
        'would you like to delete the column(s) "{0}" in collection "{1}/{2}"?: '
        .format(str(drop_col_list), db_name, coll_name))
    while input(prompt).lower() != 'yes':
        pass

    # Normalise to a list so a single name and several names share one path.
    if isinstance(drop_col_list, str):
        columns = [drop_col_list]
    else:
        columns = list(drop_col_list)

    coll = login_mongo(db_name)[coll_name]
    for column in columns:
        # NOTE(review): the selection filter is kept verbatim from the
        # original; confirm which documents `$not: {$type: 0}` matches on
        # the server version in use.
        matches = list(coll.find({column: {'$not': {'$type': 0}}}))
        print(len(matches))
        for ii, _profile_ in enumerate(matches):
            # .get() so a matched document lacking the field only prints None.
            print('{0}/{1}'.format(ii, len(matches)), _profile_.get(column))
            coll.update_one({'_id': _profile_['_id']},
                            {'$unset': {column: ''}})
def check_coll():
    """Debug helper: print one hard-coded track document and its ``updates``.

    Looks up a fixed ``_id`` in ``auxsauce.tracks``, builds a DataFrame from
    the first hit's ``updates`` field and prints both.  Raises ``IndexError``
    when the document is not found.
    """
    from pandas import DataFrame

    track_id = '100percentpurerecords/luca-morris-mozzy-rekorder'
    tracks = login_mongo('auxsauce')['tracks']
    matches = list(tracks.find({'_id': track_id}))

    history = DataFrame(matches[0]['updates'])
    print('\n', matches, '\n', history)
def show_db_stats():
    """Print the document count and collstats ``size`` of each auxsauce collection.

    For every collection the name is printed first, then (for
    ``all_profiles`` only) the shape of a frame built from the ``_id`` values
    of documents whose ``dt_updated`` exceeds a fixed cutoff, and finally the
    document count together with the ``size`` reported by ``collstats`` at a
    scale of 1024 * 1024.
    """
    from pandas import DataFrame

    # One database handle serves the whole report.
    db = login_mongo('auxsauce')

    # NOTE: collection_names() and Collection.count() are the pymongo 3.x
    # spellings (removed in pymongo 4); they are kept to match the driver
    # version the rest of this file is written against.
    for coll_name in db.collection_names():
        print('\n\n', coll_name)
        coll = db[coll_name]
        coll_ct = coll.count()
        coll_mb = db.command({
            'collstats': coll_name,
            'scale': 1024 * 1024,
        })['size']

        # check the last update time
        if coll_name == 'all_profiles':
            # NOTE(review): 1551956071 looks like an epoch-second cutoff
            # (early March 2019) -- confirm it is still the intended value.
            recent = coll.find({'dt_updated': {'$gt': 1551956071}}, {'_id': 1})
            recent_ids = [doc['_id'] for doc in recent]
            # Built from the id list directly so an empty result gives (0, 0)
            # instead of a KeyError on '_id'.
            print(DataFrame(recent_ids).shape)

        print(coll_ct, coll_mb)
def delete_collection(db, collection):
    """Drop ``collection`` from ``db`` once the operator has typed ``yes``.

    Args:
        db: Database name handed to ``login_mongo``.
        collection: Name of the collection to drop.

    The question is repeated until the answer, lower-cased, equals ``yes``.
    """
    question = 'would you like to delete the collection "{0}/{1}"?: '.format(
        db, collection)
    while True:
        answer = input(question)
        if answer.lower() == 'yes':
            break

    target = login_mongo(db)[collection]
    target.drop()
def create_constant_column_in_collection(db, collection, col_name_f, constant):
    """Set field ``col_name_f`` to ``constant`` on every document of a collection.

    Args:
        db: Database name handed to ``login_mongo``.
        collection: Name of the collection to update.
        col_name_f: Field to create or overwrite.
        constant: Value written to that field in every document.

    The update is done server-side with a single ``update_many``.  The
    previous approach (load everything into a DataFrame, drop the collection,
    re-insert) could lose all data if the re-insert failed, discarded the
    collection's indexes, filled fields missing from some documents with NaN,
    and raised on an empty collection.
    """
    login_mongo(db)[collection].update_many(
        {}, {'$set': {col_name_f: constant}})
def check_coll_arch():
    """Debug helper: print ``active_tracks_archive`` and the first doc's ``updates``.

    Fetches every document of ``auxsauce.active_tracks_archive``.  When the
    result is empty a notice and the empty list are printed and the process
    exits; otherwise the full list and a DataFrame of the first document's
    ``updates`` field are printed.
    """
    from pandas import DataFrame

    archive = login_mongo('auxsauce')['active_tracks_archive']
    docs = list(archive.find({}))

    if not docs:
        print('search results are empty')
        print((docs))
        exit()

    updates_frame = DataFrame(docs[0]['updates'])
    print('\n', docs, '\n', updates_frame)
def pull_active_profiles(_p_):
    """Load the active profiles, minus a fixed exclusion list, oldest access first.

    Args:
        _p_: Run parameters; ``_p_['tracks_or_reposts']`` selects the
            ``active_profiles_<value>`` collection.

    Returns:
        A DataFrame of the collection's documents whose ``permalink`` is not
        in the exclusion list, sorted ascending by ``dt_last_accessed``.  An
        empty collection yields an empty frame with those two columns rather
        than a ``KeyError``.
    """
    from pandas import DataFrame

    drop_list = ['choraltracks', 'family-stations-inc', 'radio_rural',
                 'feiyr', 'rinsefm', 'thismorningshow', 'kcrw',
                 'officialsxsw', 'meditations']

    coll_name = 'active_profiles_{0}'.format(_p_['tracks_or_reposts'])
    records = list(login_mongo('auxsauce')[coll_name].find({}))
    if not records:
        # Nothing stored: hand back an empty frame with the columns that the
        # filtering and sorting below would otherwise have required.
        return DataFrame(columns=['permalink', 'dt_last_accessed'])

    _ap_df_ = DataFrame(records)
    _ap_df_ = _ap_df_[~_ap_df_['permalink'].isin(drop_list)]

    # sort the dataframe
    return _ap_df_.sort_values('dt_last_accessed')
def filter_coll(db, collection):
    """Delete low-activity documents from a collection and print the size change.

    A document is removed when ``current.past_year_track_ct`` equals 0 and
    ``current.past_30d_repost_ct`` is at most 5.

    Args:
        db: Database name handed to ``login_mongo``.
        collection: Name of the collection to prune.
    """
    coll_cursor = login_mongo(db)[collection]

    # filter out conditions
    before_size = coll_cursor.count()
    coll_cursor.delete_many({
        '$and': [
            {'current.past_year_track_ct': {'$eq': 0}},
            {'current.past_30d_repost_ct': {'$lte': 5}},
        ]
    })
    after_size = coll_cursor.count()

    print('before size: {0} after size: {1}'.format(
        before_size, after_size))
if _reposts_ != []: rep_coll = _aux_db_['reposts'] rep_coll.update_one(filter={'_id': _prof_['_id']}, update={ '$set': { 'dt_last_update': int(time()), 'reposts': _reposts_ } }, upsert=True) while True: # load collection t = time() aux_db = login_mongo('auxsauce') pc_coll = aux_db['profiles_current'] pu_coll = aux_db['profiles_updates'] if 1 == 0: reposts_db_list = list(aux_db['reposts'].find({'_id': 'therealstyme'}, { '_id': 0, 'reposts': 1 })) rdbl = reposts_db_list[0]['reposts'][0] print(DataFrame(rdbl)) print(len(rdbl)) print(type(rdbl)) exit()
min_access_profiles_df = profiles_df[profiles_df['access_ct'] <= min_access_ct] print(min_access_profiles_df.shape, min_access_ct) least_recent_list = sorted(min_access_profiles_df.to_dict('records'), key = itemgetter('permalink')) # shuffle list from random import shuffle shuffle(least_recent_list) for ct, profile in enumerate(least_recent_list[: 1000]): print('\n\n{0}/{1}'.format(ct, len(least_recent_list)), profile['permalink']) t = time() params['loop_start'] = time() # get the df t = time() ap_tr_coll = login_mongo('auxsauce')['active_profiles_{0}'.format(t_r)].find({'permalink': profile['permalink']}) ap_tr_list = list(ap_tr_coll) try: prof_access_ct = ap_tr_list[0]['access_ct'] except IndexError: print('index error'); print(ap_tr_list, profile); exit() print('get cursors: {0:.4f} s'.format(time() - t)) profiles_remaining_ct = login_mongo('auxsauce')['active_profiles_{0}'.format(params['tracks_or_reposts'])].find({'access_ct': min_access_ct}).count() if profiles_remaining_ct == 0: exit() if min_access_ct != prof_access_ct: print('SKIPPING: {0}\n\n'.format(profile['permalink'])); continue # write the access dt to the active profile db coll (currently i have it saved so that profs are in a list) t = time() write_profile_access_dt(profile, params) print('write profile access: {0:.4f} s'.format(time() - t)) # scrape the most recent songs from a profile, for now limit to a month
def migrate_db(db_name_from, db_name_to, port_no_from=27018, port_no_to=27017):
    """Fetch the collection names of the source database.

    NOTE(review): this is the only step present here.  ``db_name_to`` and
    both port arguments are accepted but unused and nothing is returned, so
    the migration itself appears unfinished -- confirm before relying on it.
    """
    source_db = login_mongo(db_name_from)
    source_collections = source_db.collection_names()
def column_to_columns(column_name_from, coll_name, db_name='auxsauce'):
    """Open the collection that holds ``column_name_from``.

    NOTE(review): only the import and the collection lookup are present
    here; ``column_name_from`` is not used yet and nothing is returned, so
    the function appears unfinished -- confirm before relying on it.
    """
    from pandas import DataFrame

    # load collection
    database = login_mongo(db_name)
    target = database[coll_name]
def migrate_column(column_name_from, column_name_to, coll_name_from,
                   coll_name_to, db_name_from='auxsauce',
                   db_name_to='auxsauce', id_regex='^a', batch_size=100):
    """Copy one field from matching source documents into a target collection.

    Source documents are selected when ``column_name_from`` exists and their
    ``_id`` matches ``id_regex``.  For each one, the projected field is
    upserted into the target collection under the same ``_id``.  The source
    collection is never modified.

    Args:
        column_name_from: Field to copy out of the source collection.
        column_name_to: NOTE(review): currently unused -- the field is written
            under its original name.  Kept for interface compatibility.
        coll_name_from: Source collection name.
        coll_name_to: Target collection name.
        db_name_from: Source database name.
        db_name_to: Target database name.
        id_regex: Regular expression the source ``_id`` must match
            (defaults to the previously hard-coded ``'^a'``).
        batch_size: Number of documents fetched per round trip
            (defaults to the previously hard-coded 100).

    Batches are walked in ascending ``_id`` order, each starting after the
    last ``_id`` already handled.  The earlier implementation repeated the
    same ``limit(100)`` query and therefore never terminated once any
    document matched.
    """
    # get all reposts from orig coll
    from_coll = login_mongo(db_name_from)[coll_name_from]
    to_coll = login_mongo(db_name_to)[coll_name_to]
    print('\n\nfrom: ', from_coll, '\nto: ', to_coll)
    from_coll_ct = int(from_coll.count())

    projection = {'_id': 1, column_name_from: 1}
    last_id = None

    # load records from original collection
    while True:
        id_filter = {'$regex': id_regex}
        if last_id is not None:
            # Resume strictly after the last document already copied.
            id_filter['$gt'] = last_id
        query = {column_name_from: {'$exists': True}, '_id': id_filter}

        batch = list(
            from_coll.find(query, projection).sort('_id', 1).limit(batch_size))

        for ct, profile in enumerate(batch):
            print(profile['_id'], ' {0}/{1}'.format(ct, len(batch)))
            # What remains after popping _id is exactly the projected field.
            profile_id = profile.pop('_id')
            to_coll.update_one({'_id': profile_id}, {'$set': profile},
                               upsert=True)
            last_id = profile_id

        to_coll_ct = int(to_coll.count())
        print(to_coll_ct, '/', from_coll_ct, '\n\n')

        if not batch:
            break
def rename_collection(orig_coll_name, new_coll_name, db_name='auxsauce'):
    """Rename collection ``orig_coll_name`` to ``new_coll_name`` in ``db_name``.

    Args:
        orig_coll_name: Current collection name.
        new_coll_name: Name passed to the collection's ``rename``.
        db_name: Database name handed to ``login_mongo``.
    """
    login_mongo(db_name)[orig_coll_name].rename(new_coll_name)
def delete_profile_by_perma(_profile_):
    """Delete a profile from ``profiles_current`` and ``profiles_updates``.

    Args:
        _profile_: Mapping whose ``'_id'`` identifies the document to delete
            in both collections.
    """
    # Same delete against both collections, current state first.
    for coll_name in ('profiles_current', 'profiles_updates'):
        login_mongo("auxsauce")[coll_name].delete_one(
            {'_id': _profile_['_id']})

    print('deleted profile: {0}'.format(_profile_['_id']))
def write_profile_access_dt(_prof_, _p_):
    """Stamp a profile as just accessed and persist that to MongoDB.

    ``_prof_`` is mutated in place: ``dt_last_accessed`` becomes
    ``int(time())`` and ``access_ct`` becomes ``prof_access_ct + 1``.  Both
    values are then written to the matching document (by ``permalink``) in
    ``active_profiles_<tracks_or_reposts>``.  Previously only ``access_ct``
    was stored, so the access timestamp never reached the database.

    Args:
        _prof_: Profile mapping; must contain ``'permalink'``.
        _p_: Run parameters; only ``_p_['tracks_or_reposts']`` is read.
    """
    t_r = _p_['tracks_or_reposts']

    _prof_['dt_last_accessed'] = int(time())
    # NOTE: prof_access_ct is not a parameter; it is resolved from module
    # scope at call time and must have been set by the caller beforehand.
    _prof_['access_ct'] = prof_access_ct + 1

    login_mongo('auxsauce')['active_profiles_{0}'.format(t_r)].update_one(
        filter={'permalink': _prof_['permalink']},
        update={'$set': {
            'access_ct': _prof_['access_ct'],
            'dt_last_accessed': _prof_['dt_last_accessed'],
        }})