def get_users(engine=None):
    """Return a DataFrame of every distinct (user_id, username) pair in ``tweets``.

    Args:
        engine: SQLAlchemy engine to query with; a fresh one is created from
            the project db module when omitted.

    Returns:
        pandas.DataFrame with columns ``user_id`` and ``username``.
    """
    engine = engine or db.get_db_engine()
    query = 'SELECT DISTINCT user_id, username FROM tweets;'
    all_users = pd.read_sql(query, con=engine)
    my_logger.info(f"Found {len(all_users)} distinct users")
    return all_users
def main():
    """Download tweets for every configured query over its missing date ranges.

    For each (name, query) pair, looks up the date gaps that still need
    downloading, then runs the search one day at a time, retrying each day up
    to 15 times on transient network/API errors.
    """
    c = twitter.make_config()
    queries = get_queries()
    gaps = get_date_gaps()
    engine = db.get_db_engine()
    for query in queries:
        # Each query is a (name, search-string) pair; `name` keys the gaps
        # table and the per-candidate tweet counts.
        name, q = query
        dates = gaps[gaps['name'] == name]
        if len(dates) == 0:
            # No missing date ranges for this query — nothing to download.
            continue
        c.Search = q
        for start_date, end_date in zip(dates['start'], dates['end']):
            # Walk the gap one day at a time (step=1) so each search window
            # is a single [d1, d2) day.
            for d1, d2 in utils.date_range(start_date, end_date, step=1):
                c.Since = d1
                c.Until = d2
                # Running the search: up to 15 attempts per day; transient
                # errors are logged and retried after a short pause. If all
                # attempts fail, the day is skipped (best-effort).
                for i in range(15):
                    try:
                        my_logger.info(f'{name}:{d1}:Attempt {i + 1}')
                        run_search(c, name, engine, d1)
                        break
                    except (TimeoutError, ClientError, TweetError,
                            async_TimeoutError) as e:
                        msg = f'{name}:{d1}:{e}'
                        my_logger.error(msg)
                        time.sleep(2)
        # NOTE(review): `name` is interpolated directly into the WHERE clause;
        # assumed to come from trusted config, not user input — confirm.
        n_tweets_total = db.count_tweets(where=f"name = '{name}'")
        my_logger.info(
            f'TOTAL OF {n_tweets_total} TWEETS DOWNLOADED FOR {name}')
def create_users_table(engine=None, drop=False):
    """Create the ``users`` table.

    Args:
        engine: SQLAlchemy engine to execute DDL with; a fresh one is created
            from the project db module when omitted.
        drop: when True, drop any existing ``users`` table first.
    """
    if not engine:
        engine = db.get_db_engine()
    if drop:
        engine.execute('''DROP TABLE IF EXISTS users;''')
    ddl = '''
    CREATE TABLE users(
        id INT PRIMARY KEY,
        username VARCHAR(20) UNIQUE,
        followers LONGTEXT,
        following LONGTEXT
    );
    '''
    engine.execute(ddl)
def main():
    """Populate the ``users`` table with the distinct users seen in tweets.

    Fetches distinct (user_id, username) pairs, renames ``user_id`` to ``id``
    to match the users-table schema, and appends the rows to ``users``.
    """
    engine = db.get_db_engine()
    users = get_users(engine).rename(columns={"user_id": "id"})
    users.to_sql('users', con=engine, if_exists='append', index=False)
import pandas as pd

from twitpol import config, utils, db

processed_dir = config.PROCESSED

engine = db.get_db_engine()
# Fix: the table is created/queried as lowercase `tweets` elsewhere in the
# project; MySQL table identifiers are case-sensitive on Linux filesystems
# (lower_case_table_names=0), so the previous uppercase `TWEETS` would fail
# on such deployments.
data = pd.read_sql('SELECT * FROM tweets LIMIT 100', con=engine)

# `processed_dir` is not a string, but a pathlib PosixPath object.
# This doesn't change much for our purposes, except it allows you to do path
# concatenation for Windows or Mac with this neat slash syntax, rather
# than typing os.path.join(...) every time you want to build a new path. Yay
# platform-independence!
data.to_csv(processed_dir / 'example_data.csv')