示例#1
0
def run_harchiver():
    """Maintain a connection to the queue and archive incoming messages.

    Runs forever. Each attempt creates a fresh ``WarcWriterPool`` writing to
    ``settings.output_directory``, connects to ``settings.amqp_url``,
    declares and binds the durable queue ``settings.in_queue`` on the direct
    exchange ``settings.exchange``, then hands every message body to
    ``callback``. A message is acknowledged only when ``callback`` returns
    exactly ``True``; any other result rejects it with ``requeue=True``.

    On any ``Exception`` the consumer is cancelled, the connection is closed
    and the writer is cleaned up (each only if it was actually created), and
    a new attempt starts after a 15 second pause. ``KeyboardInterrupt``
    cleans up the writer and ends the process via ``sys.exit()``.
    """

    while True:
        # Reset per-attempt state so the handlers below never read an unbound
        # name, nor a writer/connection left over from a previous attempt.
        channel = None
        connection = None
        warcwriter = None
        try:
            logger.info("Setting up warc writer, in %s" %
                        settings.output_directory)
            warcwriter = WarcWriterPool(gzip=True,
                                        output_dir=settings.output_directory)
            logger.info("Starting connection: %s" % (settings.amqp_url))
            parameters = pika.URLParameters(settings.amqp_url)
            connection = pika.BlockingConnection(parameters)
            channel = connection.channel()
            channel.exchange_declare(exchange=settings.exchange,
                                     type="direct",
                                     durable=True,
                                     auto_delete=False)
            channel.queue_declare(queue=settings.in_queue,
                                  durable=True,
                                  exclusive=False,
                                  auto_delete=False)
            channel.queue_bind(queue=settings.in_queue,
                               exchange=settings.exchange,
                               routing_key=settings.binding_key)
            channel.basic_qos(prefetch_count=settings.qos_num)
            logger.info("Started connection: %s" % (settings.amqp_url))
            for method_frame, properties, body in channel.consume(
                    settings.in_queue):
                handled = callback(warcwriter, body)
                # Only an explicit True counts as success; everything else
                # puts the message back on the queue.
                if handled is True:
                    channel.basic_ack(method_frame.delivery_tag)
                else:
                    channel.basic_reject(
                        delivery_tag=method_frame.delivery_tag, requeue=True)

        except Exception as e:
            logger.error("Error: %s" % e)
            if channel and channel.is_open and not channel.is_closing:
                try:
                    requeued_messages = channel.cancel()
                    logger.info("Requeued %i messages" % requeued_messages)
                except Exception:
                    logger.warning("Could not cancel/shutdown neatly.")
            # Close the connection too; otherwise every retry leaves one
            # more connection behind.
            if connection is not None and connection.is_open:
                try:
                    connection.close()
                except Exception:
                    logger.warning("Could not close connection neatly.")
            if warcwriter:
                warcwriter.cleanup()
            logger.warning("Sleeping for 15 seconds before retrying...")
            time.sleep(15)
        except KeyboardInterrupt:
            # Tidy up:
            if warcwriter:
                warcwriter.cleanup()
            # quit
            sys.exit()
示例#2
0
def run_harchiver():
    """Maintain a connection to the queue and archive incoming messages.

    Loops forever. Every attempt builds a new ``WarcWriterPool`` that writes
    to ``settings.output_directory``, opens a blocking connection to
    ``settings.amqp_url``, declares the direct exchange ``settings.exchange``
    and the durable queue ``settings.in_queue``, binds them with
    ``settings.binding_key``, and then feeds each consumed message body to
    ``callback``. The message is acked only if ``callback`` returns exactly
    ``True``; otherwise it is rejected with ``requeue=True``.

    If an ``Exception`` is raised, the consumer is cancelled, the connection
    closed and the writer cleaned up (whichever of them exist), and the loop
    retries after sleeping 15 seconds. ``KeyboardInterrupt`` cleans up the
    writer and terminates through ``sys.exit()``.
    """

    while True:
        # Start each attempt from a known state: the except clauses inspect
        # these names and must not see them unbound or stale.
        channel = None
        connection = None
        warcwriter = None
        try:
            logger.info("Setting up warc writer, in %s" % settings.output_directory)
            warcwriter = WarcWriterPool(gzip=True, output_dir=settings.output_directory)
            logger.info("Starting connection: %s" % (settings.amqp_url))
            parameters = pika.URLParameters(settings.amqp_url)
            connection = pika.BlockingConnection(parameters)
            channel = connection.channel()
            channel.exchange_declare(exchange=settings.exchange,
                                     type="direct",
                                     durable=True,
                                     auto_delete=False)
            channel.queue_declare(queue=settings.in_queue,
                                  durable=True,
                                  exclusive=False,
                                  auto_delete=False)
            channel.queue_bind(queue=settings.in_queue,
                               exchange=settings.exchange,
                               routing_key=settings.binding_key)
            channel.basic_qos(prefetch_count=settings.qos_num)
            logger.info("Started connection: %s" % (settings.amqp_url))
            for method_frame, properties, body in channel.consume(settings.in_queue):
                handled = callback(warcwriter, body)
                # Anything other than a literal True is treated as failure
                # and the message is handed back to the broker.
                if handled is True:
                    channel.basic_ack(method_frame.delivery_tag)
                else:
                    channel.basic_reject(delivery_tag=method_frame.delivery_tag, requeue=True)

        except Exception as e:
            logger.error("Error: %s" % e)
            if channel and channel.is_open and not channel.is_closing:
                try:
                    requeued_messages = channel.cancel()
                    logger.info("Requeued %i messages" % requeued_messages)
                except Exception:
                    logger.warning("Could not cancel/shutdown neatly.")
            # Release the connection before retrying so failed attempts do
            # not pile up open connections.
            if connection is not None and connection.is_open:
                try:
                    connection.close()
                except Exception:
                    logger.warning("Could not close connection neatly.")
            if warcwriter:
                warcwriter.cleanup()
            logger.warning("Sleeping for 15 seconds before retrying...")
            time.sleep(15)
        except KeyboardInterrupt:
            # Tidy up:
            if warcwriter:
                warcwriter.cleanup()
            # quit
            sys.exit()
示例#3
0
                videoblock = "".join( [ httpheaders( video.raw._original_response ), video.content ] )
                writemetadata( video_url, video_uuid, base64.b64encode(r.content), index, page )
            else:
                video_url = url
                video_date = warc_datetime_str( datetime.now() )
                video_type = WarcRecord.RESOURCE
                content_type = "video/mp4"
                writemetadata( video_url, video_uuid, base64.b64encode( etree.tostring( object ).strip() ), index, page )
                videoblock = streamvideo( video_url )
                if len( videoblock ) == 0 or videoblock is None:
                    print "ERROR: Couldn't stream video; %s" % video_url
                    continue
            headers = [
                ( WarcRecord.TYPE, video_type ),
                ( WarcRecord.URL, video_url ),
                ( WarcRecord.DATE, video_date ),
                ( WarcRecord.ID, video_uuid ),
                ( WarcRecord.CONTENT_TYPE, content_type ),
            ]
            warcwriter.write_record( headers, content_type, videoblock )

if __name__ == "__main__":
    # Command-line entry point. Each argument is either a plain URL or, when
    # it starts with a digit, "<timestamp>/<url>".
    # The writer is bound at module level under this exact name because the
    # record-writing code above refers to `warcwriter` directly.
    warcwriter = WarcWriterPool( gzip=True, write_warcinfo=False )
    try:
        for arg in sys.argv[ 1: ]:
            if arg[ 0 ].isdigit():
                # Split on the first "/" only, so slashes in the URL survive.
                timestamp, url = arg.split( "/", 1 )
                getvideo( url, timestamp=timestamp )
            else:
                # Fetch the current argument; using sys.argv[1] here would
                # re-fetch the first argument for every plain URL given.
                getvideo( arg )
    finally:
        # Always clean up the writer, even if a fetch raised.
        warcwriter.cleanup()
示例#4
0
    elif args.filename:
        data = None
        with open(args.filename, "rb") as d:
            data = d.read()
        if len(data) == 0 or data is None:
            print "ERROR: %s" % args.filename
            sys.exit(1)
        mime, encoding = mimetypes.guess_type(args.filename)
        mtime = os.stat(args.filename).st_mtime
        warcdate = warc_datetime_str(datetime.fromtimestamp(mtime))
        video_uuid = "<urn:uuid:%s>" % uuid.uuid1()
        if args.multiple:
            for pair in args.multiple.split(","):
                t, p = pair.split("/", 1)
                write_metadata(args.url, video_uuid, t, args.xpath, p,
                               warcdate)
        else:
            write_metadata(args.url, video_uuid, args.timestamp, args.xpath,
                           args.page, warcdate)
        headers = [
            (WarcRecord.TYPE, WarcRecord.RESOURCE),
            (WarcRecord.URL, args.url),
            (WarcRecord.DATE,
             warc_datetime_str(datetime.fromtimestamp(mtime))),
            (WarcRecord.ID, video_uuid),
            (WarcRecord.CONTENT_TYPE, mime),
        ]
        warcwriter.write_record(headers, mime, data)
    warcwriter.cleanup()
    logger = logging.getLogger( "archiver" )
    logger.setLevel( logging.WARNING )
    logger.addHandler( logging.StreamHandler( sys.stdout ) )
    logging.root.setLevel( logging.WARNING )

    parser = argparse.ArgumentParser( description="Archiving tweets." )
    parser.add_argument( "-u", "--users", type=str, help="Comma-separated list of users to follow." )
    parser.add_argument( "-t", "--terms", type=str, help="Comma-separated list of terms to track." )
    args = parser.parse_args()

    users = []
    terms = []
    if args.users is not None:
        users = args.users.split( "," )
    if args.terms is not None:
        terms = args.terms.split( "," )
    if len( users + terms ) == 0:
        parser.print_help()
        sys.exit( 1 )

    w = WarcWriterPool( gzip=True )
    try:
        if len(users) > 0:
            users = screen_names_to_ids(auth, users)
        stream = tweepy.Stream( auth=auth, listener=StreamListener( writer=w ) )
        stream.filter( follow=users, track=terms )
    except KeyboardInterrupt as k:
        w.cleanup()
        sys.exit( 0 )