def parallel_migration(coastal_counties_db, tweet_db_file):
    current_coastal_counties_db_file = os.path.join(
        RESULTS_PATH, os.path.split(tweet_db_file)[1])
    current_coastal_counties_db = Database(current_coastal_counties_db_file)
    coastal_counties_tweets_table = current_coastal_counties_db.create_table(
        *COASTAL_COUNTIES_TWEETS_TABLE)
    joined_rows, other_db_name = coastal_counties_db.ijoin(
        (tweet_db_file, 'other_db', 'tweets'),
        FIELDS_TO_SELECT_FOR_JOIN + ',counties.fips',
        MATCH_CRITERIA_FOR_JOIN)
    current_coastal_counties_db.cursor.execute('BEGIN')
    current_coastal_counties_db.insert(
        'INSERT INTO {} VALUES(?,?,?,?,?)'.format(
            coastal_counties_tweets_table),
        joined_rows, many=True)
    current_coastal_counties_db.connection.commit()
    print "\tDetaching database..."
    coastal_counties_db.cursor.execute(
        "DETACH DATABASE '{}'".format(other_db_name))
    current_coastal_counties_db.connection.close()
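# `Database.ijoin` is implemented elsewhere in the repo. Based only on the
# call site above, a minimal sketch of the assumed behavior: ATTACH the other
# db file under an alias, run an inner join between the local counties table
# and the attached tweets table, and return the joined rows plus the alias so
# the caller can DETACH when finished. The signature, the hardcoded
# 'counties' table, and the body are assumptions, not the actual
# implementation.
def ijoin(self, other, fields, match_criteria):
    other_db_file, other_db_name, other_table = other
    self.cursor.execute(
        'ATTACH DATABASE "{}" AS {}'.format(other_db_file, other_db_name))
    rows = self.cursor.execute(
        'SELECT {} FROM counties INNER JOIN {}.{} ON {}'.format(
            fields, other_db_name, other_table, match_criteria))
    return rows, other_db_name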
def process_db_file(user_chunks, tweet_db_file):
    mover_tweets_db = Database(
        os.path.join(MOVERS_TWEETS_DB_PATH, tweet_db_file.split('/')[-1]))
    mover_tweets_tb = mover_tweets_db.create_table(TWEETS_TBNAME,
                                                   TWEETS_COLUMNS)
    mover_tweets_db.cursor.execute('BEGIN')
    current_tweets_db = Database(tweet_db_file)
    s = time.time()
    for i, user_chunk in enumerate(user_chunks):
        # Report progress roughly every 25% of the chunks, and on the last.
        if int(ceil((float(i) / len(user_chunks)) * 100)) % 25 == 0 \
                or i == len(user_chunks) - 1:
            print "\n\tProcessing chunk {} out of {}".format(
                i + 1, len(user_chunks))
        process_user_chunk(user_chunk, current_tweets_db, mover_tweets_db)
    current_tweets_db.connection.close()
    mover_tweets_db.connection.commit()
    mover_tweets_db.connection.close()
    print '\n\tElapsed Time for db file: {}s\n'.format(
        round(time.time() - s, 2))
def process_single_db(db_file):
    db = Database(db_file)
    users_fips_tb = db.create_table(USER_FIPS_TBNAME, USER_FIPS_COLUMNS)
    user_chunks = chunkify([user[0] for user in get_users_in_db(db_file)],
                           n=10000)
    process_user_chunks(user_chunks, db)
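# `chunkify` is defined elsewhere in the repo; a minimal sketch of what the
# call sites in these scripts assume: a generator that yields consecutive
# slices of at most n items. The body here is an assumption based on usage,
# not the repo's actual implementation.
def chunkify(items, n=10000):
    """Yield successive slices of `items`, each with at most n elements."""
    for start in range(0, len(items), n):
        yield items[start:start + n]

# e.g. list(chunkify([1, 2, 3, 4, 5], n=2)) -> [[1, 2], [3, 4], [5]]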
def process_users_chunk(chunk, stats_db_file):
    db = Database(stats_db_file)
    chunk_dbfile = os.path.join(
        DISTRIBUTED_TWEET_STATS_USER_PATH,
        'users_{}_{}.db'.format(chunk[0], chunk[-1]))
    chunk_db = Database(chunk_dbfile)
    stats_tb = chunk_db.create_table('statistics', TWEET_STATS_COLUMNS)
    chunk_db.cursor.execute('BEGIN')
    for uid in chunk:
        # Bail out if the WAL shared-memory file has grown past 100MB.
        # The .db-shm file only exists once SQLite is in WAL mode, so guard
        # against it being missing before calling getsize.
        shm_file = chunk_dbfile.replace('.db', '.db-shm')
        if os.path.exists(shm_file) and os.path.getsize(shm_file) >= 1e8:
            print "{} is larger than 100MB".format(chunk_dbfile)
            break
        results = db.select(
            'SELECT * FROM statistics WHERE user_id={}'.format(uid))
        chunk_db.insert(
            'INSERT INTO {} VALUES (?, ?, ?, ?, ?)'.format(stats_tb),
            results, many=True)
    chunk_db.connection.commit()
    chunk_db.connection.close()
    db.connection.close()
def process_fips_chunk(chunk, stats_db_file):
    db = Database(stats_db_file)
    chunk_dbfile = os.path.join(
        DISTRIBUTED_TWEET_STATS_PATH,
        'fips_{}_{}.db'.format(chunk[0], chunk[-1]))
    chunk_db = Database(chunk_dbfile)
    stats_tb = chunk_db.create_table('statistics', TWEET_STATS_COLUMNS)
    chunk_db.cursor.execute('BEGIN')
    for fips in chunk:
        results = db.select(
            'SELECT * FROM statistics WHERE fips={}'.format(fips))
        chunk_db.insert(
            'INSERT INTO {} VALUES (?, ?, ?, ?, ?)'.format(stats_tb),
            results, many=True)
    chunk_db.connection.commit()
    chunk_db.connection.close()
    db.connection.close()
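# The `Database` wrapper used throughout these scripts is defined elsewhere
# in the repo. A minimal sketch of the interface the functions above rely on
# (create_table returning the table name, select returning an iterable
# cursor, insert with many=True); the body is an assumption based on the
# call sites, not the actual implementation.
import sqlite3

class Database(object):
    def __init__(self, db_file):
        self.connection = sqlite3.connect(db_file)
        self.cursor = self.connection.cursor()

    def create_table(self, name, columns):
        self.cursor.execute(
            'CREATE TABLE IF NOT EXISTS {} ({})'.format(name, columns))
        return name

    def select(self, query):
        return self.cursor.execute(query)

    def insert(self, query, rows, many=False):
        if many:
            self.cursor.executemany(query, rows)
        else:
            self.cursor.execute(query, rows)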
if TEST_SIZE:
    unique_users = [
        user[0] for user in chronology_db.select(
            'SELECT DISTINCT user_id FROM user_fips LIMIT {}'.format(
                TEST_SIZE))
    ]
else:
    unique_users = [
        user[0] for user in chronology_db.select(
            'SELECT DISTINCT user_id FROM user_fips')
    ]
print "Number of Unique Users: {}".format(len(unique_users))
chronology_tb = chronology_db.create_table('user_chronology',
                                           USER_CHRONOLOGY_COLUMNS)
user_chunks = list(chunkify(unique_users, n=100))
for i, user_chunk in enumerate(user_chunks):
    if int(ceil((float(i) / len(user_chunks)) * 100)) % 25 == 0 \
            or i == len(user_chunks) - 1:
        print "\nProcessing chunk {} out of {}".format(
            i + 1, len(user_chunks))
    process_user_chunk_chronology(user_chunk, chronology_db, centroids)
print '\nElapsed Time: {}s\n'.format(round(time.time() - s, 2))
print 'Size: {}\n'.format(TEST_SIZE if TEST_SIZE else 'All')
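# The ceil-modulo progress test above recurs throughout these scripts. A
# small helper (hypothetical, not in the repo) makes the intent explicit:
# report when the completed percentage reaches a multiple of `step`, or on
# the final item.
from math import ceil

def at_progress_milestone(i, total, step=25):
    """True when i/total has reached a multiple of `step` percent."""
    return int(ceil((float(i) / total) * 100)) % step == 0 or i == total - 1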
try:
    tweet_stats_db_files = fd.askopenfilenames(
        title='Choose databases with tweet statistics')
    county_weather_db_file = fd.askopenfilename(
        title='Choose database with county weather')
    # Both databases are required below, so bail if either is missing.
    if not tweet_stats_db_files or not county_weather_db_file:
        raise Exception('\nNo databases selected! Goodbye.\n')
except Exception as e:
    print e
    sys.exit()

s = time.time()
for i, stats_dbfile in enumerate(tweet_stats_db_files):
    print "{} out of {} dbs".format(i + 1, len(tweet_stats_db_files))
    stats_db = Database(stats_dbfile)
    stats_db.create_table('aggregated_tweet_data',
                          AGGREGATED_TWEET_DATA_COLUMNS)
    stats_db.cursor.execute(
        'ATTACH "{}" as wdb'.format(county_weather_db_file))
    stats_db.cursor.execute('BEGIN')
    # 'user_id INT, date_text TEXT, fips INT, tmax REAL, prcp REAL,
    #  humidity REAL, total_tweets INT, weather_tweets INT,'
    stats_db.cursor.execute("""
        INSERT INTO aggregated_tweet_data
        SELECT statistics.user_id, statistics.date_text, statistics.fips,
               weather.tmax, weather.prcp, weather.hmd,
               statistics.total_tweets, statistics.weather_tweets
        FROM statistics INNER JOIN
s = time.time()
user_chronology_db = Database(user_chronology_db_file)
print
for i, mover_tweets_db_file in enumerate(mover_tweets_db_files):
    print "Processing db file {} out of {}".format(
        i + 1, len(mover_tweets_db_files))
    mover_tweets_db = Database(mover_tweets_db_file)
    weather_info_db = Database(
        os.path.join(WEATHER_INFO_DB_PATH,
                     mover_tweets_db_file.split('/')[-1]))
    weather_info_tb = weather_info_db.create_table('tweets',
                                                   WEATHER_INFO_COLUMNS)
    ntweets = mover_tweets_db.select(
        'SELECT COUNT(user_id) FROM tweets').fetchone()[0]
    mover_tweets_db.select('SELECT * FROM tweets')
    weather_info_db.cursor.execute('BEGIN')
    processed_tweets = 0
    s = time.time()
    while True:
        if int(ceil((float(processed_tweets) / ntweets) * 100)) % 25 == 0 \
                or processed_tweets == ntweets - 1:
if __name__ == '__main__':
    try:
        weather_info_db_files = fd.askopenfilenames(
            title='Choose databases with weather info tweets')
        if not weather_info_db_files:
            raise Exception('\nNo database selected! Goodbye.\n')
    except Exception as e:
        print e
        sys.exit()

    s = time.time()
    stats_db = Database(TWEET_STATS_DB)
    stats_tb = stats_db.create_table(TWEET_STATS_TB, TWEET_STATS_COLUMNS)
    total_time = 0
    # stash, stash_path = load_processed_stash()
    # get db_files that haven't been processed yet and that are complete
    # u_weather_info_db_files = filter(
    #     lambda f: not is_processed_file(f, stash), weather_info_db_files)
    # for i, weather_info_db_file in enumerate(weather_info_db_files):
    for i, weather_info_db_file in enumerate(weather_info_db_files):
        print "Processing db file {} out of {}".format(
            i + 1, len(weather_info_db_files))
        weather_info_db = Database(weather_info_db_file)
        unique_users = [
if __name__ == "__main__": try: dbs = fd.askopenfilenames(title='Get databases') if not dbs: raise Exception('\nNo databases selected! Goodbye.\n') except Exception as e: print e sys.exit() aggregated_data_db_path = os.path.join(RESULTS_DIR_PATH, USER_FIPS_DB) aggregated_data_db = Database(aggregated_data_db_path) user_fips_tb = aggregated_data_db.create_table(USER_FIPS_TBNAME, USER_FIPS_COLUMNS_UNIQUE) s = time.time() for db_file in dbs: db = Database(db_file) db.select('SELECT * FROM {tbn}'.format(tbn=USER_FIPS_TBNAME)) current_user_fips = [(user_id, fips, get_date_from_path(db_file)) for user_id, fips in db.cursor] aggregated_data_db.cursor.execute('BEGIN') aggregated_data_db.insert( 'INSERT OR IGNORE INTO {tbn} VALUES (?, ?, ?)'.format( tbn=user_fips_tb),
    if not coastal_counties_tweets_db_files:
        raise Exception('\nNo databases selected! Goodbye.\n')
except Exception as e:
    print e
    sys.exit()

total_time = 0
for i, tweet_db_file in enumerate(coastal_counties_tweets_db_files):
    print '{} out of {} databases'.format(
        i + 1, len(coastal_counties_tweets_db_files))
    start = time.time()
    current_tweet_db = Database(tweet_db_file)
    stats_tb = current_tweet_db.create_table('statistics', STATS_TB_COLUMNS)
    current_tweet_db.cursor.execute('BEGIN')
    # create statistics table with group by selection
    current_tweet_db.cursor.executescript(
        INSERT_GROUP_BY_COMMAND.format(tbn=stats_tb))
    current_tweet_db.connection.commit()
    current_tweet_db.connection.close()
    end = time.time() - start
    total_time += end
    print "\tElapsed: {}s\tAverage: {}s".format(
        round(total_time, 2), round(total_time / (i + 1), 2))

print "\nTotal time: {}s".format(round(total_time, 2))
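# INSERT_GROUP_BY_COMMAND is a constant defined elsewhere in the repo; its
# exact SQL is not shown here. Purely as an illustration of the shape such a
# script could take, given the five-column statistics inserts elsewhere in
# these scripts (the column names, especially `weather_related`, are
# assumptions):
HYPOTHETICAL_INSERT_GROUP_BY_COMMAND = """
INSERT INTO {tbn}
SELECT user_id, date_text, fips, COUNT(*), SUM(weather_related)
FROM tweets
GROUP BY user_id, date_text, fips;
"""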
if __name__ == '__main__':
    try:
        aggregated_data_db = fd.askopenfilename(
            title='Choose database with aggregated data')
        if not aggregated_data_db:
            raise Exception('\nNo database selected! Goodbye.\n')
    except Exception as e:
        print e
        sys.exit()

    s = time.time()
    aggregated_db = Database(aggregated_data_db)
    potential_movers_tb = aggregated_db.create_table(
        POTENTIAL_MOVERS_TBNAME, POTENTIAL_MOVERS_COLUMNS)
    unique_users = [
        user[0] for user in aggregated_db.select(
            'SELECT DISTINCT user_id FROM {}'.format(USER_FIPS_TBNAME))
    ][:TEST_SIZE]
    user_chunks = list(chunkify(unique_users, n=10000))
    # counter = 0
    for i, user_chunk in enumerate(user_chunks):
        print "User chunk {} out of {}".format(i + 1, len(user_chunks))
        aggregated_db.cursor.execute('BEGIN')
try:
    aggregated_data_db = fd.askopenfilename(
        title='Choose database with aggregated data')
    centroids = get_centroids()
    if not aggregated_data_db:
        raise Exception('\nNo database selected! Goodbye.\n')
except Exception as e:
    print e
    sys.exit()

s = time.time()
aggregated_db = Database(aggregated_data_db)
movers_db = Database(MOVERS_DB_NAME)
movers_tb = movers_db.create_table(MOVERS_TBNAME, MOVERS_COLUMNS)
potential_movers = [
    user[0] for user in aggregated_db.select(
        'SELECT DISTINCT user_id FROM {}'.format(POTENTIAL_MOVERS_TBNAME))
][:TEST_SIZE]
user_chunks = chunkify(potential_movers, n=10000)
for user_chunk in user_chunks:
    movers_db.cursor.execute('BEGIN')
    for user in user_chunk:
        actual_moves = user_fips_compare(user, aggregated_data_db, centroids)
        if actual_moves:
            movers_db.insert(
                'INSERT INTO {tbn} VALUES(?, ?, ?, ?, ?)'.format(
                    tbn=movers_tb),
                actual_moves, many=True)
    movers_db.connection.commit()
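# `user_fips_compare` and `get_centroids` live elsewhere in the repo. Based
# only on the call site above, user_fips_compare returns a (possibly empty)
# list of five-element move records for one user, presumably by comparing
# the distance between consecutive county centroids for that user. A
# hypothetical building block such a comparison could use (the miles-based
# distance is an assumption, not the repo's confirmed approach):
from math import radians, sin, cos, asin, sqrt

def haversine_miles(lat1, lon1, lat2, lon2):
    """Great-circle distance between two lat/lon points, in miles."""
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
    a = sin((lat2 - lat1) / 2) ** 2 + \
        cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
    return 3956 * 2 * asin(sqrt(a))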
if not rwls_path:
    print "\nNo directory selected. Goodbye!\n"
    sys.exit()

if not os.path.exists('./results'):
    os.makedirs('results')

# Connect / Create results database
db = Database('results/tree_ring_data.db')

# Create all tables
species_tb = db.create_table(
    "species",
    "species_id NVARCHAR(4) PRIMARY KEY NOT NULL, species_name NVARCHAR(8) NULL"
)
sites_tb = db.create_table(
    "sites", """
    site_id NVARCHAR(7) PRIMARY KEY NOT NULL,
    site_name NVARCHAR(52) NULL,
    elevation NVARCHAR(5) NULL,
    latitude REAL NULL,
    longitude REAL NULL
    """)
# from sites_tb: location NVARCHAR(13) NULL,
trees_tb = db.create_table(
    "trees", """
except Exception as e:
    print e
    sys.exit()

coastal_counties_db = Database(coastal_counties_db_file)
for i, tweet_db_file in enumerate(tweet_db_files):
    print '{} out of {} databases'.format(i + 1, len(tweet_db_files))
    current_coastal_counties_db_file = os.path.join(
        RESULTS_PATH, os.path.split(tweet_db_file)[1])
    current_coastal_counties_db = Database(current_coastal_counties_db_file)
    coastal_counties_tweets_table = current_coastal_counties_db.create_table(
        *COASTAL_COUNTIES_TWEETS_TABLE)
    joined_rows, other_db_name = coastal_counties_db.ijoin(
        (tweet_db_file, 'other_db', 'tweets'),
        FIELDS_TO_SELECT_FOR_JOIN + ',{}.fips'.format(CURRENT_COUNTIES_TABLE),
        MATCH_CRITERIA_FOR_JOIN)
    current_coastal_counties_db.cursor.execute('BEGIN')
    current_coastal_counties_db.insert(
        'INSERT INTO {} VALUES(?,?,?,?,?)'.format(
            coastal_counties_tweets_table),
        joined_rows, many=True)
    current_coastal_counties_db.connection.commit()
    coastal_counties_db.cursor.execute(
        "DETACH DATABASE '{}'".format(other_db_name))
    current_coastal_counties_db.connection.close()