Example #1
def pull_tweets(rabbit_ip="localhost", postgres_ip="localhost"):
    def callback(chn, method, properties, body):
        publisher = body.decode("utf-8")
        api = apis.get_twitter()
        print(f"Publishing a tweets for {publisher}")
        statii = get_last_tweets(api, publisher, postgres_ip)
        now = datetime.datetime.utcnow()
        try:
            for status in statii:
                # Skip tweets created within the last 24 hours; only older
                # tweets are forwarded to the "tweets" queue.
                if now - datetime.timedelta(hours=24) < status.created_at:
                    continue
                chn.basic_publish(exchange="",
                                  routing_key="tweets",
                                  body=pickle.dumps(status))
        except tweepy.error.TweepError as err:
            # If we hit the Twitter rate limit, sleep for 15 minutes and retry.
            sleep(15 * 60)
            return callback(chn, method, properties, body)
        chn.basic_ack(delivery_tag=method.delivery_tag)

    with queue.connection(rabbit_ip) as conn:
        channel = conn.channel()
        channel.queue_declare(queue="tweets", durable=True)
        channel.basic_consume(queue="publishers", on_message_callback=callback)
        channel.start_consuming()
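Every example in this listing opens RabbitMQ through a queue.connection helper that is not shown. A minimal sketch of what it could look like, assuming it simply wraps pika.BlockingConnection in a context manager (module layout and defaults are guesses, not the project's actual code):

import contextlib

import pika


@contextlib.contextmanager
def connection(host="localhost"):
    # Open a blocking AMQP connection and make sure it is closed on exit.
    conn = pika.BlockingConnection(pika.ConnectionParameters(host=host))
    try:
        yield conn
    finally:
        conn.close()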
Example #2
def select_publishers(rabbit_ip="localhost", postgres_ip="localhost"):
    rows = None
    with database.connection(postgres_ip) as conn:
        cur = conn.cursor()
        cur.execute("SELECT screen_name FROM publishers;")
        rows = cur.fetchall()
    publishers = [
        row[0] for row in rows if row[0] not in ["FoxNews", "foxnewspolitics"]
    ]
    with queue.connection(rabbit_ip) as conn:
        channel = conn.channel()
        channel.queue_declare(queue="publishers", durable=True)
        for publisher in publishers:
            print(f"Publishing publisher: {publisher}")
            channel.basic_publish(exchange="",
                                  routing_key="publishers",
                                  body=publisher)
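database.connection is likewise absent from the listing. A plausible sketch, assuming a psycopg2 connection wrapped in a context manager (the database name and user below are placeholders):

import contextlib

import psycopg2


@contextlib.contextmanager
def connection(host="localhost"):
    # dbname and user are illustrative; the real helper may read them from config.
    conn = psycopg2.connect(host=host, dbname="media_analyzer", user="postgres")
    try:
        yield conn
    finally:
        conn.close()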
Example #3
def store(rabbit_ip="localhost", postgres_ip="localhost", batch_size=100):
    records = []

    schema_file = Path(__file__).parent / "tweet.avsc"
    with open(schema_file) as f:
        raw_schema = json.load(f)
    schema = parse_schema(raw_schema)

    def insert_records(db_conn, records):
        # Inserts happen one row at a time; a duplicate tweet id raises
        # UniqueViolation, which rolls back the current transaction.
        sql = """INSERT INTO tweets (id, publisher, language, created_at, text,
                                     original_screen_name, raw, retweets, favorites)
                 VALUES (%(id)s, %(publisher)s, %(language)s, %(created_at)s,
                         %(text)s, %(original_screen_name)s, %(raw)s, %(retweets)s, %(favorites)s);"""
        cur = db_conn.cursor()
        for record in records:
            try:
                cur.execute(sql, record)
            except psycopg2.errors.UniqueViolation as err:
                db_conn.rollback()
        cur.close()
        db_conn.commit()

    def callback(chn, method, properties, body):
        stream = io.BytesIO(body)
        for tweet in reader(stream, schema):
            records.append(process_tweet(tweet, postgres_ip))
        stream.close()

        if len(records) >= batch_size:
            print(f"Inserting {len(records)} records")
            with database.connection(postgres_ip) as db_conn:
                insert_records(db_conn, records)
            records.clear()
            chn.basic_ack(delivery_tag=method.delivery_tag, multiple=True)

    with queue.connection(rabbit_ip) as conn:
        channel = conn.channel()
        channel.queue_declare(queue="records_db", durable=True)
        channel.queue_bind("records_db", "records_router")
        channel.basic_consume(queue="records_db", on_message_callback=callback)
        channel.start_consuming()
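Examples #3, #4 and #5 all parse the same tweet.avsc Avro schema, which is not included here. A minimal inline stand-in, just to make the snippets self-contained (the real schema almost certainly defines many more Twitter fields):

from fastavro import parse_schema

# Hypothetical minimal schema; the project's tweet.avsc is expected to be richer.
raw_schema = {
    "type": "record",
    "name": "Tweet",
    "fields": [
        {"name": "id", "type": "long"},
        {"name": "created_at", "type": "string"},
        {"name": "text", "type": "string"},
    ],
}
schema = parse_schema(raw_schema)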
Example #4
def process_tweets(rabbit_ip="localhost"):
    schema_file = Path(__file__).parent / "tweet.avsc"
    with open(schema_file) as f:
        raw_schema = json.load(f)
    schema = parse_schema(raw_schema)

    def callback(chn, method, properties, body):
        tweet = pickle.loads(body)
        stream = io.BytesIO()
        writer(stream, schema, [tweet._json])
        chn.basic_publish(exchange="records_router",
                          routing_key="records",
                          body=stream.getvalue())
        chn.basic_ack(delivery_tag=method.delivery_tag)

    with queue.connection(rabbit_ip) as conn:
        channel = conn.channel()
        channel.exchange_declare("records_router",
                                 exchange_type="fanout",
                                 durable=True)
        channel.queue_declare(queue="records")
        channel.basic_consume(queue="tweets", on_message_callback=callback)
        channel.start_consuming()
Example #5
def upload(rabbit_ip="localhost", batch_size=100):
    records = []
    schema_file = Path(__file__).parent / "tweet.avsc"
    with open(schema_file) as f:
        raw_schema = json.load(f)
    schema = parse_schema(raw_schema)
    aws_resource = apis.get_aws()
    bucket = aws_resource.Bucket("media-analyzer-store")

    def upload_records(records):
        stream = io.BytesIO()
        writer(stream, schema, records)
        stream.seek(0)
        file_name = datetime.now().strftime('%Y%m-%d%H-%M%S-') + str(
            uuid4()) + ".avro"
        bucket.upload_fileobj(stream, file_name)
        stream.close()

    def callback(chn, method, properties, body):
        stream = io.BytesIO(body)
        for tweet in reader(stream, schema):
            records.append(tweet)
        stream.close()

        if len(records) >= batch_size:
            print(f"Inserting {len(records)} records")
            upload_records(records)
            records.clear()
            chn.basic_ack(delivery_tag=method.delivery_tag, multiple=True)

    with queue.connection(rabbit_ip) as conn:
        channel = conn.channel()
        channel.queue_declare(queue="records_datalake", durable=True)
        channel.queue_bind("records_datalake", "records_router")
        channel.basic_consume(queue="records_datalake",
                              on_message_callback=callback)
        channel.start_consuming()
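Example #1 and Example #5 obtain third-party clients from an apis module that is not part of the listing. A hedged sketch of the two helpers, assuming credentials come from environment variables (the variable names are invented for illustration):

import os

import boto3
import tweepy


def get_twitter():
    # Environment variable names are assumptions, not the project's actual config.
    auth = tweepy.OAuthHandler(os.environ["TWITTER_API_KEY"],
                               os.environ["TWITTER_API_SECRET"])
    auth.set_access_token(os.environ["TWITTER_ACCESS_TOKEN"],
                          os.environ["TWITTER_ACCESS_SECRET"])
    return tweepy.API(auth)


def get_aws():
    # boto3 resolves credentials from the environment or ~/.aws/credentials.
    return boto3.resource("s3")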
Example #6
def channel():
    with queue.connection() as conn:
        channel = conn.channel()
        channel.queue_declare(queue="test")
        yield channel
        channel.queue_delete(queue="test")
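Example #6 is a generator that, in context, reads like a pytest fixture body (the @pytest.fixture decorator was likely dropped by the listing). Assuming it is registered as a fixture, a hypothetical test could use it like this:

def test_test_queue_is_declared(channel):
    # queue_declare with passive=True raises if the queue does not exist,
    # so reaching the assert means the fixture created the "test" queue.
    result = channel.queue_declare(queue="test", passive=True)
    assert result.method.queue == "test"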
Example #7
def test_queue_is_up():
    # Smoke test: opening (and closing) a connection must not raise.
    with queue.connection() as conn:
        pass