import datetime
import io
import json
import pickle
from pathlib import Path
from time import sleep
from uuid import uuid4

import psycopg2
import tweepy
from fastavro import parse_schema, reader, writer

# Project-local modules (assumed): apis wraps the Twitter/AWS credentials,
# while queue and database wrap the pika and psycopg2 connections. Note that
# "queue" here shadows the stdlib module of the same name. The helpers
# get_last_tweets and process_tweet are also project-local and defined
# elsewhere in the repository.
import apis
import database
import queue


def pull_tweets(rabbit_ip="localhost", postgres_ip="localhost"):
    def callback(chn, method, properties, body):
        publisher = body.decode("utf-8")
        api = apis.get_twitter()
        print(f"Publishing tweets for {publisher}")
        statii = get_last_tweets(api, publisher, postgres_ip)
        now = datetime.datetime.utcnow()
        try:
            for status in statii:
                # Skip tweets older than 24 hours.
                if status.created_at < now - datetime.timedelta(hours=24):
                    continue
                chn.basic_publish(exchange="",
                                  routing_key="tweets",
                                  body=pickle.dumps(status))
        except tweepy.error.TweepError:
            # Hit the API rate limit: sleep for 15 minutes and retry.
            sleep(15 * 60)
            return callback(chn, method, properties, body)
        chn.basic_ack(delivery_tag=method.delivery_tag)

    with queue.connection(rabbit_ip) as conn:
        channel = conn.channel()
        channel.queue_declare(queue="tweets", durable=True)
        channel.basic_consume(queue="publishers", on_message_callback=callback)
        channel.start_consuming()
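
# --- Sketch (assumption): the project-local queue module used above is not
# shown in this section. It is assumed to look roughly like the following:
# a context manager that opens and closes a pika.BlockingConnection.
#
# queue.py
import contextlib

import pika


@contextlib.contextmanager
def connection(host="localhost"):
    """Yield an open RabbitMQ connection, closing it on exit."""
    conn = pika.BlockingConnection(pika.ConnectionParameters(host=host))
    try:
        yield conn
    finally:
        conn.close()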

def select_publishers(rabbit_ip="localhost", postgres_ip="localhost"):
    rows = None
    with database.connection(postgres_ip) as conn:
        cur = conn.cursor()
        cur.execute("SELECT screen_name FROM publishers;")
        rows = cur.fetchall()
    publishers = [
        row[0] for row in rows
        if row[0] not in ["FoxNews", "foxnewspolitics"]
    ]
    with queue.connection(rabbit_ip) as conn:
        channel = conn.channel()
        channel.queue_declare(queue="publishers", durable=True)
        for publisher in publishers:
            print(f"Publishing publisher: {publisher}")
            channel.basic_publish(exchange="",
                                  routing_key="publishers",
                                  body=publisher)
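
# --- Sketch (assumption): the project-local database module is likewise not
# shown here. It is assumed to wrap psycopg2.connect in the same style; the
# database name and credentials below are placeholders, not values from the
# original code.
#
# database.py
import contextlib

import psycopg2


@contextlib.contextmanager
def connection(host="localhost"):
    """Yield an open PostgreSQL connection, closing it on exit."""
    conn = psycopg2.connect(host=host, dbname="media_analyzer",
                            user="postgres", password="postgres")
    try:
        yield conn
    finally:
        conn.close()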

def store(rabbit_ip="localhost", postgres_ip="localhost", batch_size=100):
    records = []
    schema_file = Path(__file__).parent / "tweet.avsc"
    with open(schema_file) as f:
        raw_schema = json.load(f)
    schema = parse_schema(raw_schema)

    def insert_records(db_conn, records):
        sql = """INSERT INTO tweets (id, publisher, language, created_at, text,
                                     original_screen_name, raw, retweets, favorites)
                 VALUES (%(id)s, %(publisher)s, %(language)s, %(created_at)s, %(text)s,
                         %(original_screen_name)s, %(raw)s, %(retweets)s, %(favorites)s);"""
        cur = db_conn.cursor()
        for record in records:
            cur.execute("SAVEPOINT before_insert;")
            try:
                cur.execute(sql, record)
                cur.execute("RELEASE SAVEPOINT before_insert;")
            except psycopg2.errors.UniqueViolation:
                # Roll back only the duplicate insert so earlier rows in the
                # batch are kept; a full rollback would discard them all.
                cur.execute("ROLLBACK TO SAVEPOINT before_insert;")
        cur.close()
        db_conn.commit()

    def callback(chn, method, properties, body):
        stream = io.BytesIO(body)
        for tweet in reader(stream, schema):
            records.append(process_tweet(tweet, postgres_ip))
        stream.close()
        if len(records) >= batch_size:
            print(f"Inserting {len(records)} records")
            with database.connection(postgres_ip) as db_conn:
                insert_records(db_conn, records)
            records.clear()
            # Acknowledge every message up to and including this one.
            chn.basic_ack(delivery_tag=method.delivery_tag, multiple=True)

    with queue.connection(rabbit_ip) as conn:
        channel = conn.channel()
        channel.queue_declare(queue="records_db", durable=True)
        channel.queue_bind("records_db", "records_router")
        channel.basic_consume(queue="records_db", on_message_callback=callback)
        channel.start_consuming()
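
# An alternative to the per-row savepoint in insert_records: assuming id is
# the primary key of the tweets table (not confirmed by this section),
# PostgreSQL can skip duplicates itself with ON CONFLICT, saving two round
# trips per record. DEDUP_SQL is a hypothetical name for illustration.
DEDUP_SQL = """INSERT INTO tweets (id, publisher, language, created_at, text,
                                   original_screen_name, raw, retweets, favorites)
               VALUES (%(id)s, %(publisher)s, %(language)s, %(created_at)s, %(text)s,
                       %(original_screen_name)s, %(raw)s, %(retweets)s, %(favorites)s)
               ON CONFLICT (id) DO NOTHING;"""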

def process_tweets(rabbit_ip="localhost"):
    schema_file = Path(__file__).parent / "tweet.avsc"
    with open(schema_file) as f:
        raw_schema = json.load(f)
    schema = parse_schema(raw_schema)

    def callback(chn, method, properties, body):
        tweet = pickle.loads(body)
        stream = io.BytesIO()
        writer(stream, schema, [tweet._json])
        # records_router is a fanout exchange, so the routing key is ignored
        # and every bound queue receives a copy of the message.
        chn.basic_publish(exchange="records_router",
                          routing_key="records",
                          body=stream.getvalue())
        chn.basic_ack(delivery_tag=method.delivery_tag)

    with queue.connection(rabbit_ip) as conn:
        channel = conn.channel()
        channel.exchange_declare("records_router", exchange_type="fanout",
                                 durable=True)
        channel.queue_declare(queue="records")
        channel.basic_consume(queue="tweets", on_message_callback=callback)
        channel.start_consuming()

def upload(rabbit_ip="localhost", batch_size=100):
    records = []
    schema_file = Path(__file__).parent / "tweet.avsc"
    with open(schema_file) as f:
        raw_schema = json.load(f)
    schema = parse_schema(raw_schema)
    aws_resource = apis.get_aws()
    bucket = aws_resource.Bucket("media-analyzer-store")

    def upload_records(records):
        stream = io.BytesIO()
        writer(stream, schema, records)
        stream.seek(0)
        # A timestamp plus a UUID keeps object names unique in the bucket.
        file_name = (datetime.datetime.now().strftime("%Y%m%d-%H%M%S-")
                     + str(uuid4()) + ".avro")
        bucket.upload_fileobj(stream, file_name)
        stream.close()

    def callback(chn, method, properties, body):
        stream = io.BytesIO(body)
        for tweet in reader(stream, schema):
            records.append(tweet)
        stream.close()
        if len(records) >= batch_size:
            print(f"Uploading {len(records)} records")
            upload_records(records)
            records.clear()
            # Acknowledge every message up to and including this one.
            chn.basic_ack(delivery_tag=method.delivery_tag, multiple=True)

    with queue.connection(rabbit_ip) as conn:
        channel = conn.channel()
        channel.queue_declare(queue="records_datalake", durable=True)
        channel.queue_bind("records_datalake", "records_router")
        channel.basic_consume(queue="records_datalake",
                              on_message_callback=callback)
        channel.start_consuming()
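
# Hypothetical entry point illustrating how the workers above fit together;
# it is not part of the original module. Each consumer blocks in
# start_consuming(), so the workers run as separate processes. In practice
# startup order matters because queues and exchanges are declared lazily
# (e.g. process_tweets declares the records_router exchange that store and
# upload bind to); a production setup would declare the topology up front.
from multiprocessing import Process


def main():
    # Seed the "publishers" queue first; this also declares it.
    select_publishers()
    workers = [process_tweets, store, upload, pull_tweets]
    procs = [Process(target=worker) for worker in workers]
    for proc in procs:
        proc.start()
    for proc in procs:
        proc.join()


if __name__ == "__main__":
    main()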

# The fixture and test below presumably live in a separate pytest module.
import pytest


@pytest.fixture
def channel():
    """Yield a channel with a throwaway 'test' queue, deleting it afterwards."""
    with queue.connection() as conn:
        channel = conn.channel()
        channel.queue_declare(queue="test")
        yield channel
        channel.queue_delete(queue="test")

def test_queue_is_up():
    # Opening a connection is enough to verify the broker is reachable.
    with queue.connection():
        pass
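
# A hypothetical companion test showing the channel fixture in use: publish a
# message to the throwaway "test" queue and read it straight back. basic_get
# returns (None, None, None) when the queue is empty, so with a
# BlockingConnection the message published just above should be retrievable
# immediately.
def test_publish_roundtrip(channel):
    channel.basic_publish(exchange="", routing_key="test", body=b"hello")
    method, properties, body = channel.basic_get(queue="test", auto_ack=True)
    assert body == b"hello"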