예제 #1
0
def run_harchiver():
    """Consume messages from the AMQP queue forever, retrying on failure.

    Each pass of the outer loop builds a fresh ``WarcWriterPool`` and a new
    blocking connection, declares and binds the exchange/queue named in
    ``settings``, and hands every message body to ``callback`` together with
    the writer pool. A message is acknowledged only when ``callback`` returns
    exactly ``True``; any other result rejects it with ``requeue=True``.

    On any ``Exception`` the consumer is cancelled when the channel is still
    usable, the writer pool (if one was created) is cleaned up, and the loop
    retries after a 15 second sleep. On ``KeyboardInterrupt`` the writer pool
    is cleaned up and the process exits through ``sys.exit()``.
    """

    while True:
        channel = None
        # Bind the name before entering the try block so both handlers can
        # safely test it. Without this, a failure inside WarcWriterPool()
        # raised UnboundLocalError in the handler on the first pass, and on
        # later passes re-ran cleanup() on the previous, already cleaned pool.
        warcwriter = None
        try:
            logger.info("Setting up warc writer, in %s" % settings.output_directory)
            warcwriter = WarcWriterPool(gzip=True, output_dir=settings.output_directory)
            logger.info("Starting connection: %s" % (settings.amqp_url))
            parameters = pika.URLParameters(settings.amqp_url)
            connection = pika.BlockingConnection(parameters)
            channel = connection.channel()
            channel.exchange_declare(exchange=settings.exchange,
                                     type="direct",
                                     durable=True,
                                     auto_delete=False)
            channel.queue_declare(queue=settings.in_queue,
                                  durable=True,
                                  exclusive=False,
                                  auto_delete=False)
            channel.queue_bind(queue=settings.in_queue,
                               exchange=settings.exchange,
                               routing_key=settings.binding_key)
            channel.basic_qos(prefetch_count=settings.qos_num)
            logger.info("Started connection: %s" % (settings.amqp_url))
            for method_frame, properties, body in channel.consume(settings.in_queue):
                handled = callback(warcwriter, body)
                # Only an explicit True counts as success; anything else is
                # pushed back onto the queue for another attempt.
                if handled is True:
                    channel.basic_ack(method_frame.delivery_tag)
                else:
                    channel.basic_reject(delivery_tag=method_frame.delivery_tag, requeue=True)

        except Exception as e:
            logger.error("Error: %s" % e)
            if channel and channel.is_open and not channel.is_closing:
                try:
                    requeued_messages = channel.cancel()
                    logger.info("Requeued %i messages" % requeued_messages)
                except Exception:
                    # Best effort only: the retry below proceeds regardless.
                    logger.warning("Could not cancel/shutdown neatly.")
            if warcwriter:
                warcwriter.cleanup()
            logger.warning("Sleeping for 15 seconds before retrying...")
            time.sleep(15)
        except KeyboardInterrupt:
            # Tidy up:
            if warcwriter:
                warcwriter.cleanup()
            # quit
            sys.exit()
예제 #2
0
def run_harchiver():
    """Consume messages from the AMQP queue forever, retrying on failure.

    Each pass of the outer loop builds a fresh ``WarcWriterPool`` and a new
    blocking connection, declares and binds the exchange/queue named in
    ``settings``, and hands every message body to ``callback`` together with
    the writer pool. A message is acknowledged only when ``callback`` returns
    exactly ``True``; any other result rejects it with ``requeue=True``.

    On any ``Exception`` the consumer is cancelled when the channel is still
    usable, the writer pool (if one was created) is cleaned up, and the loop
    retries after a 15 second sleep. On ``KeyboardInterrupt`` the writer pool
    is cleaned up and the process exits through ``sys.exit()``.
    """

    while True:
        channel = None
        # Bind the name before entering the try block so both handlers can
        # safely test it. Without this, a failure inside WarcWriterPool()
        # raised UnboundLocalError in the handler on the first pass, and on
        # later passes re-ran cleanup() on the previous, already cleaned pool.
        warcwriter = None
        try:
            logger.info("Setting up warc writer, in %s" %
                        settings.output_directory)
            warcwriter = WarcWriterPool(gzip=True,
                                        output_dir=settings.output_directory)
            logger.info("Starting connection: %s" % (settings.amqp_url))
            parameters = pika.URLParameters(settings.amqp_url)
            connection = pika.BlockingConnection(parameters)
            channel = connection.channel()
            channel.exchange_declare(exchange=settings.exchange,
                                     type="direct",
                                     durable=True,
                                     auto_delete=False)
            channel.queue_declare(queue=settings.in_queue,
                                  durable=True,
                                  exclusive=False,
                                  auto_delete=False)
            channel.queue_bind(queue=settings.in_queue,
                               exchange=settings.exchange,
                               routing_key=settings.binding_key)
            channel.basic_qos(prefetch_count=settings.qos_num)
            logger.info("Started connection: %s" % (settings.amqp_url))
            for method_frame, properties, body in channel.consume(
                    settings.in_queue):
                handled = callback(warcwriter, body)
                # Only an explicit True counts as success; anything else is
                # pushed back onto the queue for another attempt.
                if handled is True:
                    channel.basic_ack(method_frame.delivery_tag)
                else:
                    channel.basic_reject(
                        delivery_tag=method_frame.delivery_tag, requeue=True)

        except Exception as e:
            logger.error("Error: %s" % e)
            if channel and channel.is_open and not channel.is_closing:
                try:
                    requeued_messages = channel.cancel()
                    logger.info("Requeued %i messages" % requeued_messages)
                except Exception:
                    # Best effort only: the retry below proceeds regardless.
                    logger.warning("Could not cancel/shutdown neatly.")
            if warcwriter:
                warcwriter.cleanup()
            logger.warning("Sleeping for 15 seconds before retrying...")
            time.sleep(15)
        except KeyboardInterrupt:
            # Tidy up:
            if warcwriter:
                warcwriter.cleanup()
            # quit
            sys.exit()
예제 #3
0
def run_harchiver():
    """Keep a consumer attached to the AMQP queue, reconnecting on failure.

    One ``WarcWriterPool`` is created up front and reused across reconnects.
    Inside the loop a blocking connection is opened, the exchange and queue
    named in ``settings`` are declared and bound, and every message body is
    passed to ``callback`` with the writer pool and then acknowledged.

    Any ``Exception`` is logged; if the channel is still open and not closing
    the consumer is cancelled, and the loop retries after 15 seconds.
    """

    warcwriter = WarcWriterPool(gzip=True, output_dir=settings.OUTPUT_DIRECTORY)
    while True:
        channel = None
        try:
            logger.info("Starting connection: %s" % (settings.AMQP_URL))
            url_params = pika.URLParameters(settings.AMQP_URL)
            channel = pika.BlockingConnection(url_params).channel()
            channel.exchange_declare(
                exchange=settings.AMQP_EXCHANGE,
                type="direct",
                durable=True,
                auto_delete=False,
            )
            channel.queue_declare(
                queue=settings.AMQP_QUEUE,
                durable=True,
                exclusive=False,
                auto_delete=False,
            )
            channel.queue_bind(
                queue=settings.AMQP_QUEUE,
                exchange=settings.AMQP_EXCHANGE,
                routing_key=settings.AMQP_KEY,
            )
            logger.info("Started connection: %s" % (settings.AMQP_URL))
            for delivery, _props, payload in channel.consume(settings.AMQP_QUEUE):
                callback(warcwriter, payload)
                channel.basic_ack(delivery.delivery_tag)
        except Exception as err:
            logger.error("Error: %s" % err)
            # Only attempt a cancel on a channel that exists and is usable.
            cancellable = channel and channel.is_open and not channel.is_closing
            if cancellable:
                try:
                    requeued_count = channel.cancel()
                    logger.info("Requeued %i messages" % requeued_count)
                except Exception:
                    logger.warning("Could not cancel/shutdown neatly.")
            logger.warning("Sleeping for 15 seconds before retrying...")
            time.sleep(15)
예제 #4
0
                videoblock = "".join( [ httpheaders( video.raw._original_response ), video.content ] )
                writemetadata( video_url, video_uuid, base64.b64encode(r.content), index, page )
            else:
                video_url = url
                video_date = warc_datetime_str( datetime.now() )
                video_type = WarcRecord.RESOURCE
                content_type = "video/mp4"
                writemetadata( video_url, video_uuid, base64.b64encode( etree.tostring( object ).strip() ), index, page )
                videoblock = streamvideo( video_url )
                if len( videoblock ) == 0 or videoblock is None:
                    print "ERROR: Couldn't stream video; %s" % video_url
                    continue
            headers = [
                ( WarcRecord.TYPE, video_type ),
                ( WarcRecord.URL, video_url ),
                ( WarcRecord.DATE, video_date ),
                ( WarcRecord.ID, video_uuid ),
                ( WarcRecord.CONTENT_TYPE, content_type ),
            ]
            warcwriter.write_record( headers, content_type, videoblock )

if __name__ == "__main__":
    # Command-line entry point. Each argument is either "<timestamp>/<url>"
    # (recognised by a leading digit) or a bare URL.
    # warcwriter is deliberately bound at module level: the record-writing
    # code earlier in this file refers to it by that name.
    warcwriter = WarcWriterPool( gzip=True, write_warcinfo=False )
    try:
        for arg in sys.argv[ 1: ]:
            if arg[ 0 ].isdigit():
                timestamp, url = arg.split( "/", 1 )
                getvideo( url, timestamp=timestamp )
            else:
                # Fetch the argument being iterated; using sys.argv[1] here
                # would re-fetch the first argument for every bare URL.
                getvideo( arg )
    finally:
        # Run cleanup even when a fetch raises.
        warcwriter.cleanup()
예제 #5
0
        ]
        warcwriter.write_record( headers, mime, data )

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument( "-m", dest="multiple", help="Multiple, comma-separated timestamp/page values." )
    parser.add_argument( "-p", dest="page", help="Embedding page." )
    parser.add_argument( "-t", dest="timestamp", help="Embedding page timestamp." )
    parser.add_argument( "-x", dest="xpath", help="XPath to element." )
    parser.add_argument( "-u", dest="url", help="Video URL." )
    parser.add_argument( "-f", dest="filename", help="Filename on disk." )
    parser.add_argument( "-l", dest="playlist", help="Playlist of videos." )
    parser.add_argument( "-y", action="store_true", help="YouTube videos [iframes only]." )
    
    args = parser.parse_args()
    warcwriter = WarcWriterPool( gzip=True, write_warcinfo=False )
    
    if args.playlist:
        write_playlist( args.page, args.timestamp, args.xpath, args.playlist, args.filename )
    elif not args.filename:
        if args.y:
            r = requests.get( args.page )
            if not r.ok:
                print "ERROR: %s" % r.content
                sys.exit( 1 )
            ydl = youtube_dl.YoutubeDL()
            ydl.add_default_info_extractors()
            htmlparser = etree.HTMLParser()
            root = etree.fromstring( r.content, htmlparser )
            for iframe in root.xpath( "//iframe[contains(@src,'www.youtube.com/embed/')]" ):
                yurl = iframe.attrib["src"]
예제 #6
0
        dest="multiple",
        help="Multiple, comma-separated timestamp/page values.")
    parser.add_argument("-p", dest="page", help="Embedding page.")
    parser.add_argument("-t",
                        dest="timestamp",
                        help="Embedding page timestamp.")
    parser.add_argument("-x", dest="xpath", help="XPath to element.")
    parser.add_argument("-u", dest="url", help="Video URL.")
    parser.add_argument("-f", dest="filename", help="Filename on disk.")
    parser.add_argument("-l", dest="playlist", help="Playlist of videos.")
    parser.add_argument("-y",
                        action="store_true",
                        help="YouTube videos [iframes only].")

    args = parser.parse_args()
    warcwriter = WarcWriterPool(gzip=True, write_warcinfo=False)

    if args.playlist:
        write_playlist(args.page, args.timestamp, args.xpath, args.playlist,
                       args.filename)
    elif not args.filename:
        if args.y:
            r = requests.get(args.page)
            if not r.ok:
                print "ERROR: %s" % r.content
                sys.exit(1)
            ydl = youtube_dl.YoutubeDL()
            ydl.add_default_info_extractors()
            htmlparser = etree.HTMLParser()
            root = etree.fromstring(r.content, htmlparser)
            for iframe in root.xpath(
예제 #7
0
    logger = logging.getLogger( "archiver" )
    logger.setLevel( logging.WARNING )
    logger.addHandler( logging.StreamHandler( sys.stdout ) )
    logging.root.setLevel( logging.WARNING )

    parser = argparse.ArgumentParser( description="Archiving tweets." )
    parser.add_argument( "-u", "--users", type=str, help="Comma-separated list of users to follow." )
    parser.add_argument( "-t", "--terms", type=str, help="Comma-separated list of terms to track." )
    args = parser.parse_args()

    users = []
    terms = []
    if args.users is not None:
        users = args.users.split( "," )
    if args.terms is not None:
        terms = args.terms.split( "," )
    if len( users + terms ) == 0:
        parser.print_help()
        sys.exit( 1 )

    w = WarcWriterPool( gzip=True )
    try:
        if len(users) > 0:
            users = screen_names_to_ids(auth, users)
        stream = tweepy.Stream( auth=auth, listener=StreamListener( writer=w ) )
        stream.filter( follow=users, track=terms )
    except KeyboardInterrupt as k:
        w.cleanup()
        sys.exit( 0 )

예제 #8
0
            video_url = url
            video_date = warc_datetime_str(datetime.now())
            video_type = WarcRecord.RESOURCE
            content_type = "video/mp4"
            writemetadata(video_url, video_uuid,
                          base64.b64encode(etree.tostring(object).strip()),
                          index, page)
            videoblock = streamvideo(video_url)
            if len(videoblock) == 0 or videoblock is None:
                print "ERROR: Couldn't stream video; %s" % video_url
                continue
        headers = [
            (WarcRecord.TYPE, video_type),
            (WarcRecord.URL, video_url),
            (WarcRecord.DATE, video_date),
            (WarcRecord.ID, video_uuid),
            (WarcRecord.CONTENT_TYPE, content_type),
        ]
        warcwriter.write_record(headers, content_type, videoblock)


if __name__ == "__main__":
    # Command-line entry point. Each argument is either "<timestamp>/<url>"
    # (recognised by a leading digit) or a bare URL.
    # warcwriter is deliberately bound at module level: the record-writing
    # code earlier in this file refers to it by that name.
    warcwriter = WarcWriterPool(gzip=True, write_warcinfo=False)
    try:
        for arg in sys.argv[1:]:
            if arg[0].isdigit():
                timestamp, url = arg.split("/", 1)
                getvideo(url, timestamp=timestamp)
            else:
                # Fetch the argument being iterated; using sys.argv[1] here
                # would re-fetch the first argument for every bare URL.
                getvideo(arg)
    finally:
        # Run cleanup even when a fetch raises.
        warcwriter.cleanup()