Exemplo n.º 1
0
    def proxy_route_request(self, url, environ):
        """Resolve the proxy rewrite prefix for a request from per-IP
        state stored in redis, falling back to the default prefix.

        :param str url: the requested URL
        :param dict environ: WSGI environ; mutated to carry the default
            timestamp and cache setting for downstream handlers
        :return: the prefixed URL to route to
        :rtype: str
        """
        # bug fix: assign the default first so an exception below can never
        # leave proxy_prefix unbound (the original raised NameError at the
        # return whenever the redis lookup failed)
        proxy_prefix = self.proxy_prefix

        try:
            key = 'up:' + environ['REMOTE_ADDR']
            timestamp, coll, mode, cache = self.redis.hmget(
                key, 'timestamp', 'coll', 'mode', 'cache')

            environ[
                'pywb_proxy_default_timestamp'] = timestamp or timestamp_now()
            environ['pywb_cache'] = cache

            if coll:
                self.ensure_coll_exists(coll)

                # 'live' collections are always replayed, never recorded
                if mode == 'replay' or coll == 'live':
                    proxy_prefix = '/' + coll + '/bn_/'
                else:
                    proxy_prefix = '/' + coll + '/record/bn_/'

        except Exception:
            # best-effort: log and fall back to the default prefix
            traceback.print_exc()

        return proxy_prefix + url
Exemplo n.º 2
0
    def ingest(self, text, params):
        """Index one page of extracted text into Solr.

        :param bytes text: UTF-8 encoded page text, already parsed
        :param dict params: page metadata; keys used: user, coll, rec,
            pid, url, title, timestamp, hasScreenshot
        """
        content = text.decode('utf-8')

        url = params.get('url')
        # fall back to the url when no explicit title was supplied
        # (the original defaulted title twice; once is enough)
        title = params.get('title') or url

        timestamp_s = params.get('timestamp') or timestamp_now()
        timestamp_dt = timestamp_to_iso_date(timestamp_s)
        has_screenshot_b = params.get('hasScreenshot') == '1'

        digest = self.get_digest(content)

        # field-name suffixes (_s, _t, _dt, _b) follow Solr dynamic-field
        # naming conventions
        data = {
            'user_s': params.get('user'),
            'coll_s': params.get('coll'),
            'rec_s': params.get('rec'),
            'id': params.get('pid'),
            'title_t': title,
            'content_t': content,
            'url_s': url,
            'digest_s': digest,
            'timestamp_s': timestamp_s,
            'timestamp_dt': timestamp_dt,
            'has_screenshot_b': has_screenshot_b,
        }

        # best-effort: the response is intentionally not checked
        requests.post(self.solr_api, json=data)
Exemplo n.º 3
0
    def write_dns(self, host, dns):
        """Write a WARC 'resource' record capturing a DNS response.

        A record is written even when the reply list is empty.

        :param str host: the hostname that was resolved
        :param dns: iterable of reply objects with .ttl and .host attributes
        """
        # TODO: we filter the addresses early, should we warc the unfiltered dns response?

        # the response object doesn't contain the query type 'A' or 'AAAA'
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'

        if self.writer is None:
            self.open()

        payload = timestamp_now() + '\r\n'

        for r in dns:
            try:
                payload += host + '.\t' + str(
                    r.ttl) + '\tIN\t' + kind + '\t' + r.host + '\r\n'
            except Exception as e:
                # bug fix: the message had no %-placeholders, so passing r
                # and e as lazy format args made the logging call itself
                # report a formatting error
                LOGGER.info('problem converting dns reply for warcing %r: %s',
                            r, e)
        payload = payload.encode('utf-8')

        record = self.writer.create_warc_record('dns:' + host,
                                                'resource',
                                                payload=BytesIO(payload),
                                                warc_content_type='text/dns',
                                                length=len(payload))

        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s',
                     p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)
Exemplo n.º 4
0
    def write_cdxj(self, user, cdxj_key):
        """Persist the CDXJ index lines for *cdxj_key* to a file in the
        user's temp warc directory.

        If an index file was already recorded, return it without rewriting.

        :param user: user whose temp warc path receives the file
        :param str cdxj_key: redis sorted-set key holding the cdxj lines
        :return: (cdxj filename, full path)
        """
        existing = self.get_prop(self.INDEX_FILE_KEY)
        if existing:
            return os.path.basename(strip_prefix(existing)), existing

        target_dir = user.get_user_temp_warc_path()

        suffix = base64.b32encode(os.urandom(5)).decode('utf-8')
        cdxj_filename = self.INDEX_NAME_TEMPL.format(timestamp=timestamp_now(),
                                                     random=suffix)

        os.makedirs(target_dir, exist_ok=True)
        full_filename = os.path.join(target_dir, cdxj_filename)

        lines = self.redis.zrange(cdxj_key, 0, -1)
        with open(full_filename, 'wt') as out:
            out.writelines(line + '\n' for line in lines)
            out.flush()

        # record the stored location (always '/'-separated) for reuse
        full_url = add_local_store_prefix(
            full_filename.replace(os.path.sep, '/'))
        self.set_prop(self.INDEX_FILE_KEY, full_url)

        return cdxj_filename, full_filename
Exemplo n.º 5
0
    def load_index(self, params):
        """Build a single synthetic CDX entry for a live resource.

        Fuzzy matching is not supported for live lookups; a filtered query
        triggers a best-effort HEAD probe to discover status and mime.
        """
        if params.get('is_fuzzy'):
            # no fuzzy match for live resources
            raise NotFoundException(params['url'] + '*')

        cdx = CDXObject()
        cdx['urlkey'] = params.get('key').decode('utf-8')
        cdx['timestamp'] = timestamp_now()
        cdx['url'] = params['url']
        cdx['load_url'] = self.get_load_url(params)
        cdx['is_live'] = 'true'

        mime = params.get('content_type', '')

        if not mime and params.get('filter'):
            try:
                head_res = self.sesh.head(cdx['load_url'])
                # 405 means HEAD unsupported -- leave status unset
                if head_res.status_code != 405:
                    cdx['status'] = str(head_res.status_code)

                ctype = head_res.headers.get('Content-Type')
                if ctype:
                    mime = ctype.split(';')[0]

            except Exception:
                # best-effort probe only; failures leave mime empty
                pass

        cdx['mime'] = mime

        return iter([cdx])
Exemplo n.º 6
0
    def test_live(self):
        """A live-source query returns a record keyed at the current time."""
        source = LiveIndexSource()
        target_url = 'http://example.com/'

        res, errs = self.query_single_source(source, dict(url=target_url))

        expected = 'com,example)/ {0} http://example.com/'.format(timestamp_now())
        assert key_ts_res(res, 'load_url') == expected
        assert errs == {}
Exemplo n.º 7
0
    def test_live(self):
        """Live index lookup yields one entry stamped with 'now'."""
        res, errs = self.query_single_source(LiveIndexSource(),
                                             dict(url='http://example.com/'))

        want = 'com,example)/ {0} http://example.com/'.format(timestamp_now())
        assert key_ts_res(res, 'load_url') == want
        assert errs == {}
Exemplo n.º 8
0
    def proxy_route_request(self, url, environ):
        """Record per-IP replay state into the WSGI environ, then route.

        The default timestamp comes from redis when stored, else 'now'.

        :param str url: the requested URL
        :param dict environ: WSGI environ, mutated with routing hints
        :return: the prefixed URL to route to
        :rtype: str
        """
        try:
            redis_key = 'up:' + environ['REMOTE_ADDR']
            stored = self.redis.hget(redis_key, 'timestamp')
            if not stored:
                stored = timestamp_now()
            environ['pywb_redis_key'] = redis_key
            environ['pywb_proxy_default_timestamp'] = stored
        except Exception:
            # best-effort: routing still proceeds with the default prefix
            traceback.print_exc()

        return self.proxy_prefix + url
Exemplo n.º 9
0
    def load_index(self, params):
        """Build a single CDX record that proxies the request upstream.

        The timestamp is taken from 'closest' when present, else 'now'.
        """
        entry = CDXObject()
        entry['urlkey'] = params.get('key').decode('utf-8')
        entry['timestamp'] = params.get('closest') or timestamp_now()
        entry['url'] = params['url']

        load_url = res_template(self.proxy_url, params)
        entry['load_url'] = load_url
        entry['memento_url'] = load_url

        return self._do_load(entry, params)
Exemplo n.º 10
0
    def get_top_frame(self,
                      wb_url,
                      wb_prefix,
                      host_prefix,
                      env,
                      frame_mod,
                      replay_mod,
                      coll='',
                      extra_params=None):
        """Render the top-level replay frame for *wb_url*.

        :param rewrite.wburl.WbUrl wb_url: WbUrl this template is rendered for
        :param str wb_prefix: URL prefix pywb serves the content under
        :param str host_prefix: host URL prefix pywb is running on
        :param dict env: WSGI environment of the request
        :param str frame_mod: modifier used for framing (e.g. if_)
        :param str replay_mod: modifier used for the replayed page (e.g. mp_)
        :param str coll: collection name being rendered
        :param dict extra_params: extra values passed to the template render
        :return: the frame insert string
        :rtype: str
        """
        render_params = {
            'host_prefix': host_prefix,
            'wb_prefix': wb_prefix,
            'wb_url': wb_url,
            'coll': coll,
            'options': {'frame_mod': frame_mod, 'replay_mod': replay_mod},
            'embed_url': wb_url.to_str(mod=replay_mod),
            'is_proxy': 'wsgiprox.proxy_host' in env,
            # use the requested capture time, else 'now'
            'timestamp': wb_url.timestamp or timestamp_now(),
            'url': wb_url.get_url(),
        }

        if extra_params:
            render_params.update(extra_params)

        # render the banner first so the frame template can embed it
        if self.banner_view:
            render_params['banner_html'] = self.banner_view.render_to_string(
                env, **render_params)

        return self.render_to_string(env, **render_params)
Exemplo n.º 11
0
    def get_top_frame(self, wb_url,
                      wb_prefix,
                      host_prefix,
                      env,
                      frame_mod,
                      replay_mod,
                      coll='',
                      extra_params=None):
        """Build the top frame insert for a replayed page.

        :param rewrite.wburl.WbUrl wb_url: WbUrl this template renders
        :param str wb_prefix: URL prefix serving the content
        :param str host_prefix: host URL prefix pywb runs on
        :param dict env: WSGI environment of the request
        :param str frame_mod: framing modifier (e.g. if_)
        :param str replay_mod: replayed-page modifier (e.g. mp_)
        :param str coll: collection name being rendered
        :param dict extra_params: extra values for the template render
        :return: the frame insert string
        :rtype: str
        """
        embed_url = wb_url.to_str(mod=replay_mod)

        # requested capture time, else 'now' for live content
        frame_ts = wb_url.timestamp if wb_url.timestamp else timestamp_now()

        proxy_mode = 'wsgiprox.proxy_host' in env

        render_args = dict(host_prefix=host_prefix,
                           wb_prefix=wb_prefix,
                           wb_url=wb_url,
                           coll=coll,
                           options=dict(frame_mod=frame_mod,
                                        replay_mod=replay_mod),
                           embed_url=embed_url,
                           is_proxy=proxy_mode,
                           timestamp=frame_ts,
                           url=wb_url.get_url())

        if extra_params:
            render_args.update(extra_params)

        # banner is rendered first so the frame template can embed it
        if self.banner_view:
            render_args['banner_html'] = self.banner_view.render_to_string(
                env, **render_args)

        return self.render_to_string(env, **render_args)
Exemplo n.º 12
0
    def __call__(self, params):
        """Execute a CDX query after normalizing special params.

        'closest' == 'now' becomes the current timestamp; a content_type
        becomes an exact mime filter.
        """
        if params.get('closest') == 'now':
            params['closest'] = timestamp_now()

        requested_mime = params.get('content_type')
        if requested_mime:
            params['filter'] = '=mime:' + requested_mime

        query = CDXQuery(params)
        cdx_iter, errs = self.load_index(query.params)
        return process_cdx(cdx_iter, query), dict(errs)
Exemplo n.º 13
0
    def __call__(self, params):
        """Normalize query params and run the CDX lookup.

        Maps closest='now' to the current timestamp and content_type to
        an exact-mime filter before querying.
        """
        closest = params.get('closest')
        if closest == 'now':
            params['closest'] = timestamp_now()

        content_type = params.get('content_type')
        if content_type:
            params['filter'] = '=mime:' + content_type

        query = CDXQuery(params)
        loaded, errs = self.load_index(query.params)
        processed = process_cdx(loaded, query)
        return processed, dict(errs)
Exemplo n.º 14
0
    def get_top_frame(self,
                      wb_url,
                      wb_prefix,
                      host_prefix,
                      env,
                      frame_mod,
                      replay_mod,
                      coll='',
                      extra_params=None):
        """Render the top-level frame insert for the replayed page.

        :param rewrite.wburl.WbUrl wb_url: The WbUrl for the request this template is being rendered for
        :param str wb_prefix: The URL prefix pywb is serving the content using (e.g. http://localhost:8080/live/)
        :param str host_prefix: The host URL prefix pywb is running on (e.g. http://localhost:8080)
        :param dict env: The WSGI environment dictionary for the request this template is being rendered for
        :param str frame_mod: The modifier to be used for framing (e.g. if_)
        :param str replay_mod: The modifier to be used in the URL of the page being replayed (e.g. mp_)
        :param str coll: The name of the collection this template is being rendered for
        :param dict extra_params: Additional parameters to be supplied to the template render method
        :return: The frame insert string
        :rtype: str
        """

        embed_url = wb_url.to_str(mod=replay_mod)

        # use the requested capture time, else 'now'
        if wb_url.timestamp:
            timestamp = wb_url.timestamp
        else:
            timestamp = timestamp_now()

        is_proxy = 'wsgiprox.proxy_host' in env

        params = {
            'host_prefix': host_prefix,
            'wb_prefix': wb_prefix,
            'wb_url': wb_url,
            'coll': coll,
            'options': {
                'frame_mod': frame_mod,
                'replay_mod': replay_mod
            },
            'embed_url': embed_url,
            'is_proxy': is_proxy,
            'timestamp': timestamp,
            'url': wb_url.get_url()
        }

        if extra_params:
            params.update(extra_params)

        # render the banner first so the frame template can embed it
        if self.banner_view:
            banner_html = self.banner_view.render_to_string(env, **params)
            params['banner_html'] = banner_html

        return self.render_to_string(env, **params)
Exemplo n.º 15
0
    def write_dns(self, dns, ttl, url):
        """Write a WARC 'resource' record for a DNS response.

        A record is written even when the reply list is empty.

        :param dns: iterable of reply dicts with a 'host' key
        :param ttl: time-to-live for the replies (converted to int)
        :param url: parsed URL whose .hostname was resolved
        """
        # TODO: we filter the addresses early, should we warc the unfiltered dns response?

        # the response object doesn't contain the query type 'A' or 'AAAA'
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'  # fixme IPV6

        ttl = int(ttl)
        host = url.hostname

        if self.writer is None:
            self.open()

        payload = timestamp_now() + '\r\n'

        for r in dns:
            try:
                payload += '\t'.join(
                    (host + '.', str(ttl), 'IN', kind, r['host'])) + '\r\n'
            except Exception as e:
                # bug fix: the message had no %-placeholders, so passing
                # host, r, e as lazy format args made the logging call
                # itself report a formatting error
                LOGGER.info('problem converting dns reply for warcing %s %r: %s',
                            host, r, e)
        payload = payload.encode('utf-8')

        warc_headers_dict = OrderedDict()
        warc_headers_dict['WARC-Warcinfo-ID'] = self.warcinfo_id

        record = self.writer.create_warc_record(
            'dns:' + host,
            'resource',
            warc_content_type='text/dns',
            payload=BytesIO(payload),
            length=len(payload),
            warc_headers_dict=warc_headers_dict)

        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s',
                     p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)
Exemplo n.º 16
0
    def _get_pagedata(self, user, coll, rec, pagedata):
        key = self.page_key.format(user=user, coll=coll, rec=rec)

        url = pagedata['url']

        ts = pagedata.get('timestamp')
        if not ts:
            ts = pagedata.get('ts')

        if not ts:
            ts = self._get_url_ts(user, coll, rec, url)

        if not ts:
            ts = timestamp_now()

        pagedata['timestamp'] = ts
        pagedata_json = json.dumps(pagedata)

        hkey = pagedata['url'] + ' ' + pagedata['timestamp']

        return key, hkey, pagedata_json
    def process_cdxj_key(self, cdxj_key):
        """Finalize a closed recording: write its cdxj index, commit the
        index and warc files, and clean up redis when everything succeeds.

        :param str cdxj_key: redis key of the recording's cdxj sorted set
        """
        base_key = cdxj_key.rsplit(':cdxj', 1)[0]
        # still actively recording -- leave it alone
        if self.redis.exists(base_key + ':open'):
            return

        _, user, coll, rec = base_key.split(':', 3)

        user_dir = os.path.join(self.record_root_dir, user)

        warc_key = base_key + ':warc'
        warcs = self.redis.hgetall(warc_key)

        info_key = base_key + ':info'

        self.redis.publish('close_rec', info_key)

        try:
            timestamp = sec_to_timestamp(
                int(self.redis.hget(info_key, 'updated_at')))
        except (TypeError, ValueError):
            # bug fix: was a bare 'except:'; narrowed to the two errors
            # int() can raise (hget returns None when the field is absent)
            timestamp = timestamp_now()

        cdxj_filename = self.write_cdxj(warc_key, user_dir, cdxj_key,
                                        timestamp)

        all_done = self.commit_file(user, coll, rec, user_dir, cdxj_filename,
                                    'indexes', warc_key, cdxj_filename,
                                    self.info_index_key)

        for warc_filename, value in warcs.items():
            done = self.commit_file(user, coll, rec, user_dir, warc_filename,
                                    'warcs', warc_key, value)

            all_done = all_done and done

        # only drop the redis state once every file has been committed
        if all_done:
            print('Deleting Redis Key: ' + cdxj_key)
            self.redis.delete(cdxj_key)
            self.remove_if_empty(user_dir)
Exemplo n.º 18
0
    def write_cdxj(self, user, cdxj_key):
        """Write CDX index lines to a file in the user's temp warc path.

        An already-recorded index file is returned without rewriting.

        :param RedisUniqueComponent user: user
        :param str cdxj_key: CDX index file Redis key

        :returns: CDX file filename and path
        :rtype: str and str
        """
        previously_written = self.get_prop(self.INDEX_FILE_KEY)
        if previously_written:
            name = os.path.basename(strip_prefix(previously_written))
            return name, previously_written

        dirname = user.get_user_temp_warc_path()
        os.makedirs(dirname, exist_ok=True)

        rand_part = base64.b32encode(os.urandom(5)).decode('utf-8')
        cdxj_filename = self.INDEX_NAME_TEMPL.format(
            timestamp=timestamp_now(), random=rand_part)

        full_filename = os.path.join(dirname, cdxj_filename)

        entries = self.redis.zrange(cdxj_key, 0, -1)
        with open(full_filename, 'wt') as out:
            for entry in entries:
                out.write(entry + '\n')
            out.flush()

        # remember the stored location ('/'-separated) for later calls
        stored_url = add_local_store_prefix(
            full_filename.replace(os.path.sep, '/'))
        self.set_prop(self.INDEX_FILE_KEY, stored_url)

        return cdxj_filename, full_filename
Exemplo n.º 19
0
    def get_transclusion_metadata(self, tc, url, orig_mime=None):
        """Resolve metadata describing a transcluded (embedded) resource.

        :param dict tc: transclusion info: 'url', optional 'timestamp',
            'selector' and 'metadata_file'
        :param str url: url of the original resource
        :param orig_mime: mime type of the original resource, if known
        :return: (containing page url, containing timestamp, metadata)
        """
        contain_url = tc['url']
        contain_ts = str(tc.get('timestamp') or timestamp_now())
        selector = tc.get('selector')

        if tc.get('metadata_file'):
            # bug fix: the file contents were read into an unused local
            # named 'metadata', leaving 'all_metadata' unbound and raising
            # NameError at the return below
            with open(tc.get('metadata_file'), 'rt') as fh:
                all_metadata = fh.read()

        else:
            all_metadata = {}
            all_metadata['webpage_url'] = contain_url
            all_metadata['webpage_timestamp'] = contain_ts
            if selector:
                all_metadata['selector'] = selector

            formats = []

            if self.conversion_serializer:
                for file_info, _, metadata in self.conversion_serializer.find_conversions(
                        url):
                    metadata['url'] = file_info.url
                    metadata['original_url'] = url
                    formats.append(metadata)

            # always include the original resource as a format entry
            orig_format = {
                'url': url,
                'ext': url.rsplit('.')[-1],
                'original': True,
            }

            if orig_mime:
                orig_format['mime'] = orig_mime

            formats.append(orig_format)

            all_metadata['formats'] = formats

        return contain_url, contain_ts, all_metadata
Exemplo n.º 20
0
    def write_snapshot(self, user, collection, url, title, html_text, referrer,
                       user_agent, browser=None):
        """Store a static HTML snapshot into the 'Static Snapshots' recording
        of *collection* via an upstream put_record request, and register a
        page entry for it when a title is given.

        :param user: the owning user (``.name`` is used)
        :param collection: collection receiving the snapshot
        :param str url: url the snapshot captures
        :param str title: page title; empty means no page entry is added
        :param str html_text: snapshot HTML
        :param str referrer: referrer recorded in the WARC request headers
        :param str user_agent: user agent recorded in the WARC request headers
        :param browser: optional browser id recorded on the page entry
        :return: dict with 'snapshot' page data, or 'error_message' on failure
        """

        snap_title = 'Static Snapshots'

        snap_rec_name = self.sanitize_title(snap_title)

        # reuse the shared snapshots recording, creating it on first use
        recording = collection.get_recording_by_name(snap_rec_name)
        if not recording:
            recording = collection.create_recording(snap_rec_name,
                                                    title=snap_rec_name)

        kwargs = dict(user=user.name,
                      coll=collection.my_id,
                      rec=quote(snap_rec_name, safe='/*'),
                      type='put_record')

        params = {'url': url}

        upstream_url = self.content_app.get_upstream_url('', kwargs, params)

        headers = {'Content-Type': 'text/html; charset=utf-8',
                   'WARC-User-Agent': user_agent,
                   'WARC-Referer': referrer,
                  }

        r = requests.put(upstream_url,
                         data=BytesIO(html_text.encode('utf-8')),
                         headers=headers,
                        )

        try:
            res = r.json()
            if res['success'] != 'true':
                print(res)
                return {'error_message': 'Snapshot Failed'}

            warc_date = res.get('WARC-Date')

        except Exception as e:
            # any malformed/failed upstream response is treated as a failure
            print(e)
            return {'error_message': 'Snapshot Failed'}


        # no title: snapshot stored, but no page entry is created
        if not title:
            return {'snapshot': ''}

        # prefer the WARC-Date reported upstream; fall back to 'now'
        if warc_date:
            timestamp = iso_date_to_timestamp(warc_date)
        else:
            timestamp = timestamp_now()


        page_data = {'url': url,
                     'title': title,
                     'timestamp': timestamp,
                     'tags': ['snapshot'],
                    }
        if browser:
            page_data['browser'] = browser

        res = recording.add_page(page_data)

        return {'snapshot': page_data}
Exemplo n.º 21
0
    def handle_download(self, user, coll_name, recs):
        """Stream a collection (or a subset of its recordings) as a single
        WARC download, either chunk-encoded or with a precomputed length.

        :param str user: owning username
        :param str coll_name: collection name
        :param str recs: comma-separated recording names, or '*' for all
        :return: iterator over WARC content chunks
        """
        user, collection = self.user_manager.get_user_coll(user, coll_name)

        if not collection:
            self._raise_error(404, 'no_such_collection')

        # superusers may download any collection
        if not self.access.is_superuser():
            self.access.assert_can_write_coll(collection)

        # collection['uid'] = coll
        collection.load()

        Stats(self.redis).incr_download(collection)

        now = timestamp_now()

        # download name: the collection, a single recording, or coll-recs
        name = coll_name
        if recs != '*':
            rec_list = recs.split(',')
            if len(rec_list) == 1:
                name = recs
            else:
                name += '-' + recs
        else:
            rec_list = None

        filename = self.download_filename.format(title=quote(name),
                                                 timestamp=now)
        loader = BlockLoader()

        coll_info = self.create_coll_warcinfo(user, collection, filename)

        def iter_infos():
            # per-recording warcinfo record plus its total payload size
            for recording in collection.get_recordings(load=True):
                if rec_list and recording.name not in rec_list:
                    continue

                warcinfo = self.create_rec_warcinfo(user, collection,
                                                    recording, filename)

                size = len(warcinfo)
                size += recording.size
                yield recording, warcinfo, size

        def read_all(infos):
            # collection warcinfo, then each recording's warcinfo + warcs
            yield coll_info

            for recording, warcinfo, _ in infos:
                yield warcinfo

                for n, warc_path in recording.iter_all_files():
                    try:
                        fh = loader.load(warc_path)
                    except Exception:
                        # skip unreadable warcs rather than abort the download
                        print('Skipping invalid ' + warc_path)
                        continue

                    for chunk in StreamIter(fh):
                        yield chunk

        response.headers['Content-Type'] = 'application/octet-stream'
        response.headers[
            'Content-Disposition'] = "attachment; filename*=UTF-8''" + filename

        # if not transfer-encoding, store infos and calculate total size
        if not self.download_chunk_encoded:
            size = len(coll_info)
            infos = list(iter_infos())
            size += sum(size for r, i, size in infos)

            response.headers['Content-Length'] = size
            return read_all(infos)

        else:
            # stream everything
            response.headers['Transfer-Encoding'] = 'chunked'

            return read_all(iter_infos())
Exemplo n.º 22
0
    def write_snapshot(self,
                       user,
                       coll,
                       url,
                       title,
                       html_text,
                       referrer,
                       user_agent,
                       browser=None):
        """Store a static HTML snapshot into the 'Static Snapshots' recording
        via an upstream snapshot request, adding a page entry when a title
        is given.

        :param user: owning user id/name
        :param coll: collection id/name
        :param str url: url the snapshot captures
        :param str title: page title; empty means no page entry is added
        :param str html_text: snapshot HTML
        :param str referrer: referrer recorded in the WARC request headers
        :param str user_agent: user agent recorded in the WARC request headers
        :param browser: optional browser id recorded on the page entry
        :return: dict with 'snapshot' page data, or 'error_message' on failure
        """

        snap_title = 'Static Snapshots'

        snap_rec = self.sanitize_title(snap_title)

        # reuse the shared snapshots recording, creating it on first use
        if not self.manager.has_recording(user, coll, snap_rec):
            recording = self.manager.create_recording(user, coll, snap_rec,
                                                      snap_title)

        kwargs = dict(user=user,
                      coll=quote(coll),
                      rec=quote(snap_rec, safe='/*'),
                      type='snapshot')

        params = {'url': url}

        upstream_url = self.manager.content_app.get_upstream_url(
            '', kwargs, params)

        headers = {
            'Content-Type': 'text/html; charset=utf-8',
            'WARC-User-Agent': user_agent,
            'WARC-Referer': referrer,
        }

        r = requests.put(
            upstream_url,
            data=BytesIO(html_text.encode('utf-8')),
            headers=headers,
        )

        try:
            res = r.json()
            if res['success'] != 'true':
                print(res)
                return {'error_message': 'Snapshot Failed'}

            warc_date = res.get('WARC-Date')

        except Exception as e:
            # any malformed/failed upstream response is treated as a failure
            print(e)
            return {'error_message': 'Snapshot Failed'}

        # no title: snapshot stored, but no page entry is created
        if not title:
            return {'snapshot': ''}

        # prefer the WARC-Date reported upstream; fall back to 'now'
        if warc_date:
            timestamp = iso_date_to_timestamp(warc_date)
        else:
            timestamp = timestamp_now()

        page_data = {
            'url': url,
            'title': title,
            'timestamp': timestamp,
            'tags': ['snapshot'],
        }
        if browser:
            page_data['browser'] = browser

        res = self.manager.add_page(user, coll, snap_rec, page_data)

        return {'snapshot': page_data}
Exemplo n.º 23
0
    def handle_download(self, user, coll_name, recs):
        """Stream a collection (or a subset of its recordings) as a single
        WARC download, either chunk-encoded or with a precomputed length.

        :param str user: owning username
        :param str coll_name: collection name
        :param str recs: comma-separated recording names, or '*' for all
        :return: iterator over WARC content chunks
        """
        user, collection = self.user_manager.get_user_coll(user, coll_name)

        if not collection:
            self._raise_error(404, 'no_such_collection')

        self.access.assert_can_write_coll(collection)

        #collection['uid'] = coll
        collection.load()

        Stats(self.redis).incr_download(collection)

        now = timestamp_now()

        # download name: the collection, a single recording, or coll-recs
        name = coll_name
        if recs != '*':
            rec_list = recs.split(',')
            if len(rec_list) == 1:
                name = recs
            else:
                name += '-' + recs
        else:
            rec_list = None

        filename = self.download_filename.format(title=quote(name),
                                                 timestamp=now)
        loader = BlockLoader()

        coll_info = self.create_coll_warcinfo(user, collection, filename)

        def iter_infos():
            # per-recording warcinfo record plus its total payload size
            for recording in collection.get_recordings(load=True):
                if rec_list and recording.name not in rec_list:
                    continue

                warcinfo = self.create_rec_warcinfo(user,
                                                    collection,
                                                    recording,
                                                    filename)

                size = len(warcinfo)
                size += recording.size
                yield recording, warcinfo, size

        def read_all(infos):
            # collection warcinfo, then each recording's warcinfo + warcs
            yield coll_info

            for recording, warcinfo, _ in infos:
                yield warcinfo

                for n, warc_path in recording.iter_all_files():
                    try:
                        fh = loader.load(warc_path)
                    except:
                        # skip unreadable warcs rather than abort the download
                        print('Skipping invalid ' + warc_path)
                        continue

                    for chunk in StreamIter(fh):
                        yield chunk

        response.headers['Content-Type'] = 'application/octet-stream'
        response.headers['Content-Disposition'] = "attachment; filename*=UTF-8''" + filename

        # if not transfer-encoding, store infos and calculate total size
        if not self.download_chunk_encoded:
            size = len(coll_info)
            infos = list(iter_infos())
            size += sum(size for r, i, size in infos)

            response.headers['Content-Length'] = size
            return read_all(infos)

        else:
            # stream everything
            response.headers['Transfer-Encoding'] = 'chunked'

            return read_all(iter_infos())
Exemplo n.º 24
0
    def handle_download(self, user, coll, rec):
        """Stream a collection (or selected recordings) as a WARC download.

        :param str user: owning username
        :param str coll: collection id
        :param str rec: comma-separated recording ids, or '*' for all
        :return: iterator over WARC content chunks
        """
        collection = self.manager.get_collection(user, coll, rec)
        if not collection:
            self._raise_error(404, 'Collection not found', id=coll)

        now = timestamp_now()

        # download name: the collection, a single recording, or coll-recs
        name = collection['id']
        if rec != '*':
            rec_list = rec.split(',')
            if len(rec_list) == 1:
                name = rec
            else:
                name += '-' + rec
        else:
            rec_list = None

        filename = self.download_filename.format(title=quote(name),
                                                 timestamp=now)
        loader = BlockLoader()

        coll_info = self.create_coll_warcinfo(user, collection, filename)

        def iter_infos():
            # per-recording warcinfo record plus its total payload size
            for recording in collection['recordings']:
                if rec_list and recording['id'] not in rec_list:
                    continue

                warcinfo = self.create_rec_warcinfo(user, collection,
                                                    recording, filename)

                size = len(warcinfo)
                size += recording['size']
                yield recording, warcinfo, size

        def read_all(infos):
            # collection warcinfo, then each recording's warcinfo + warcs
            yield coll_info

            for recording, warcinfo, _ in infos:
                yield warcinfo

                for warc_path in self._iter_all_warcs(user, coll,
                                                      recording['id']):
                    try:
                        fh = loader.load(warc_path)
                    except:
                        # skip unreadable warcs rather than abort the download
                        print('Skipping invalid ' + warc_path)
                        continue

                    for chunk in StreamIter(fh):
                        yield chunk

        response.headers['Content-Type'] = 'application/octet-stream'
        response.headers[
            'Content-Disposition'] = "attachment; filename*=UTF-8''" + filename

        # if not transfer-encoding, store infos and calculate total size
        if not self.download_chunk_encoded:
            size = len(coll_info)
            infos = list(iter_infos())
            size += sum(size for r, i, size in infos)

            response.headers['Content-Length'] = size
            return read_all(infos)

        else:
            # stream everything
            response.headers['Transfer-Encoding'] = 'chunked'

            return read_all(iter_infos())