Example #1
    def lookup_revisit(self, lookup_params, digest, url, iso_dt):
        params = {}
        for param in lookup_params:
            if param.startswith('param.'):
                params[param] = lookup_params[param]

        params['url'] = url
        params['closest'] = iso_date_to_timestamp(iso_dt)

        filters = []

        filters.append('!mime:warc/revisit')

        if digest and digest != '-':
            filters.append('digest:' + digest.split(':')[-1])

        params['filter'] = filters

        cdx_iter, errs = self.cdx_lookup(params)

        for cdx in cdx_iter:
            res = self.dupe_policy(cdx, params)
            if res:
                return res

        return None
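Every example on this page converts a WARC-style ISO-8601 date into a 14-digit CDX timestamp via iso_date_to_timestamp. A minimal sketch of the helper itself, assuming the warcio.timeutils implementation that pywb uses (the import path is an assumption, since these examples omit their imports):

    # Minimal sketch, assuming warcio's timeutils helper.
    from warcio.timeutils import iso_date_to_timestamp

    iso_date_to_timestamp('2020-01-02T03:04:05Z')  # -> '20200102030405'

Given that date and digest 'sha1:QWERTY', the method above would query the CDX index with closest='20200102030405' and filter=['!mime:warc/revisit', 'digest:QWERTY'], returning the first hit accepted by dupe_policy.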
Example #2
    def extract_text(self, record):
        url = record.rec_headers.get('WARC-Target-URI')
        date = record.rec_headers.get('WARC-Date')

        id_ = iso_date_to_timestamp(date) + '/' + url
        if id_ not in self.pages:
            return

        mime = self.get_record_mime_type(record)

        if mime not in HTML_MIME_TYPES:
            return

        if record.http_headers and record.http_headers.get_statuscode().startswith('3'):
            return

        extractor = extractors.ArticleExtractor()

        content = record.content_stream().read()

        try:
            content = content.decode("utf-8")

            doc = extractor.get_doc(content)

            if doc.content:
                self.pages[id_]["text"] = doc.content

            if doc.title:
                self.pages[id_]["title"] = doc.title
        except Exception:
            # skip text extraction in case of errors
            pass
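The ArticleExtractor here appears to come from boilerpy3 (an assumption, since the import is not shown). A self-contained sketch of the same extraction step:

    # Standalone sketch of the extraction step, assuming boilerpy3.
    from boilerpy3 import extractors

    extractor = extractors.ArticleExtractor()
    doc = extractor.get_doc('<html><head><title>T</title></head>'
                            '<body><p>Some article text.</p></body></html>')
    print(doc.title)    # extracted title, if any
    print(doc.content)  # extracted main text, if any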
Example #3
File: util.py Project: miku/wacz-format
def construct_passed_pages_dict(passed_content):
    """Creates a dictionary of the passed pages with the url as the key or ts/url if ts is present and the title and text as the values if they have been passed"""
    passed_pages_dict = {}
    for line in passed_content:
        # Skip the file's header if it's been set
        header = json.loads(line)
        if "format" not in header:
            pages_dict = dict(header)

            # Set the default key as url
            key = "%s" % pages_dict["url"]

            # If timestamp is present overwrite the key to be 'ts/url'
            if "ts" in pages_dict:
                key = "%s/%s" % (
                    iso_date_to_timestamp(pages_dict["ts"]),
                    pages_dict["url"],
                )

            # Add the key to the dictionary with a blank value
            passed_pages_dict[key] = {}

            # If title was in the passed pages line add it to the value of the just created dictionary entry
            if "title" in pages_dict:
                passed_pages_dict[key]["title"] = pages_dict["title"]

            # If text was in the passed pages line add it to the value of the just created dictionary entry
            if "text" in pages_dict:
                passed_pages_dict[key]["text"] = pages_dict["text"]

    return passed_pages_dict
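For illustration, feeding this function a header line plus two page lines (values invented) would produce:

    # Hypothetical pages.jsonl-style input (values invented).
    lines = [
        '{"format": "json-pages-1.0"}',
        '{"url": "https://example.com/", "title": "Example"}',
        '{"url": "https://example.com/a", "ts": "2020-01-02T03:04:05Z"}',
    ]
    construct_passed_pages_dict(lines)
    # -> {'https://example.com/': {'title': 'Example'},
    #     '20200102030405/https://example.com/a': {}}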
Example #4
    def _load_different_url_payload(self, cdx, headers_record, failed_files,
                                    cdx_loader):
        """
        Handle the case where a duplicate of a capture with same digest
        exists at a different url.

        If a cdx_server is provided, a query is made for matching
        url, timestamp and digest.

        Raise exception if no matches found.
        """

        digest = cdx.get('digest', '-')

        # if the digest is the empty record digest, don't attempt to look up the payload record!
        # the payload is simply empty, so use empty payload of existing record
        if digest == self.EMPTY_DIGEST:
            return headers_record

        ref_target_uri = (
            headers_record.rec_headers.get_header('WARC-Refers-To-Target-URI'))

        target_uri = headers_record.rec_headers.get_header('WARC-Target-URI')

        # if no target uri, no way to find the original
        if not ref_target_uri:
            raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)

        ref_target_date = (
            headers_record.rec_headers.get_header('WARC-Refers-To-Date'))

        if not ref_target_date:
            ref_target_date = cdx['timestamp']
        else:
            ref_target_date = iso_date_to_timestamp(ref_target_date)

        try:
            orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
                                                    ref_target_date, digest,
                                                    cdx_loader)
        except NotFoundException:
            raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)

        for orig_cdx in orig_cdx_lines:
            try:
                payload_record = self._resolve_path_load(
                    orig_cdx, False, failed_files)
                return payload_record

            except ArchiveLoadFailed as e:
                pass

        raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)
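For context, a sketch of the revisit-record headers this method consults (header names per the WARC specification; values invented):

    # WARC-Type: revisit
    # WARC-Target-URI: https://mirror.example.com/page
    # WARC-Refers-To-Target-URI: https://example.com/page
    # WARC-Refers-To-Date: 2020-01-02T03:04:05Z
    # WARC-Payload-Digest: sha1:QWERTYUIOP
    #
    # The Refers-To date becomes the 14-digit 'closest' value
    # ('20200102030405') for the duplicate lookup.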
Example #5
    def parse_warc_record(self, record):
        """ Parse warc record
        """

        entry = self._create_index_entry(record.rec_type)

        if record.rec_type == 'warcinfo':
            entry['url'] = record.rec_headers.get_header('WARC-Filename')
            entry['urlkey'] = entry['url']
            entry['_warcinfo'] = record.raw_stream.read(record.length)
            return entry

        entry['url'] = record.rec_headers.get_header('WARC-Target-Uri')

        # timestamp
        entry['timestamp'] = iso_date_to_timestamp(record.rec_headers.
                                                   get_header('WARC-Date'))

        # mime
        if record.rec_type == 'revisit':
            entry['mime'] = 'warc/revisit'
        elif self.options.get('minimal'):
            entry['mime'] = '-'
        else:
            def_mime = '-' if record.rec_type == 'request' else 'unk'
            entry.extract_mime(record.http_headers.
                               get_header('Content-Type'),
                               def_mime)
            # detected mime from WARC-Identified-Payload-Type
            entry['mime-detected'] = record.rec_headers.get_header(
                                        'WARC-Identified-Payload-Type')

        # status -- only for response records (by convention):
        if record.rec_type == 'response' and not self.options.get('minimal'):
            entry.extract_status(record.http_headers)
        else:
            entry['status'] = '-'

        # digest
        digest = record.rec_headers.get_header('WARC-Payload-Digest')
        entry['digest'] = digest
        if digest and digest.startswith('sha1:'):
            entry['digest'] = digest[len('sha1:'):]

        elif not entry.get('digest'):
            entry['digest'] = '-'

        # optional json metadata, if present
        metadata = record.rec_headers.get_header('WARC-Json-Metadata')
        if metadata:
            entry['metadata'] = metadata

        return entry
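For a typical response record, the returned entry resembles the following sketch (values invented):

    # Sketch of an index entry for a response record (values invented).
    entry = {
        'url': 'https://example.com/',
        'timestamp': '20200102030405',  # from iso_date_to_timestamp
        'mime': 'text/html',            # via entry.extract_mime(...)
        'status': '200',                # via entry.extract_status(...)
        'digest': 'QWERTYUIOP',         # 'sha1:' prefix stripped
    }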
Example #6
    def init_proxy(self, config):
        """Initialize and start proxy mode. If proxy configuration entry is not contained in the config
        this is a no op. Causes handler to become an instance of WSGIProxMiddleware.

        :param dict config: The configuration object used to configure this instance of FrontEndApp
        """
        proxy_config = config.get('proxy')
        if not proxy_config:
            return

        if isinstance(proxy_config, str):
            proxy_coll = proxy_config
            proxy_config = {}
        else:
            proxy_coll = proxy_config['coll']

        if '/' in proxy_coll:
            raise Exception('Proxy collection can not contain "/"')

        proxy_config['ca_name'] = proxy_config.get('ca_name', self.PROXY_CA_NAME)
        proxy_config['ca_file_cache'] = proxy_config.get('ca_file_cache', self.PROXY_CA_PATH)

        if proxy_config.get('recording'):
            logging.info('Proxy recording into collection "{0}"'.format(proxy_coll))
            if proxy_coll in self.warcserver.list_fixed_routes():
                raise Exception('Can not record into fixed collection')

            proxy_coll += self.RECORD_ROUTE
            if not config.get('recorder'):
                config['recorder'] = 'live'

        else:
            logging.info('Proxy enabled for collection "{0}"'.format(proxy_coll))

        if proxy_config.get('enable_content_rewrite', True):
            self.proxy_prefix = '/{0}/bn_/'.format(proxy_coll)
        else:
            self.proxy_prefix = '/{0}/id_/'.format(proxy_coll)

        self.proxy_default_timestamp = proxy_config.get('default_timestamp')
        if self.proxy_default_timestamp:
            if not self.ALL_DIGITS.match(self.proxy_default_timestamp):
                try:
                    self.proxy_default_timestamp = iso_date_to_timestamp(self.proxy_default_timestamp)
                except:
                    raise Exception('Invalid Proxy Timestamp: Must Be All-Digit Timestamp or ISO Date Format')

        self.proxy_coll = proxy_coll

        self.handler = WSGIProxMiddleware(self.handle_request,
                                          self.proxy_route_request,
                                          proxy_host=proxy_config.get('host', 'pywb.proxy'),
                                          proxy_options=proxy_config)
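The 'proxy' entry this method consumes can be either a bare collection name or a mapping. A sketch of both shapes, with keys taken from the code above and values invented:

    # Shorthand: just the collection name.
    config = {'proxy': 'my-coll'}

    # Full mapping form.
    config = {'proxy': {
        'coll': 'my-coll',
        'recording': True,                            # record via the live recorder
        'default_timestamp': '2020-01-02T03:04:05Z',  # converted to '20200102030405'
        'enable_content_rewrite': False,              # serve with id_ (no rewriting)
    }}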
Example #7
    def _load_different_url_payload(self, cdx, headers_record,
                                    failed_files, cdx_loader):
        """
        Handle the case where a duplicate of a capture with same digest
        exists at a different url.

        If a cdx_server is provided, a query is made for matching
        url, timestamp and digest.

        Raise exception if no matches found.
        """

        ref_target_uri = (headers_record.rec_headers.
                          get_header('WARC-Refers-To-Target-URI'))

        target_uri = headers_record.rec_headers.get_header('WARC-Target-URI')

        # if no target uri, no way to find the original
        if not ref_target_uri:
            raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)

        ref_target_date = (headers_record.rec_headers.
                           get_header('WARC-Refers-To-Date'))

        if not ref_target_date:
            ref_target_date = cdx['timestamp']
        else:
            ref_target_date = iso_date_to_timestamp(ref_target_date)

        digest = cdx.get('digest', '-')

        try:
            orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
                                                    ref_target_date,
                                                    digest,
                                                    cdx_loader)
        except NotFoundException:
            raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)

        for orig_cdx in orig_cdx_lines:
            try:
                payload_record = self._resolve_path_load(orig_cdx, False,
                                                         failed_files)
                return payload_record

            except ArchiveLoadFailed as e:
                pass

        raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)
Example #8
    def _write_line(self, out, index, record, filename):
        url = index.get("url")
        if not url:
            url = record.rec_headers.get("WARC-Target-URI")

        dt = record.rec_headers.get("WARC-Date")

        ts = iso_date_to_timestamp(dt)

        if hasattr(record, "urlkey"):
            urlkey = record.urlkey
        else:
            urlkey = self.get_url_key(url)

        self._do_write(urlkey, ts, index, out)
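What reaches _do_write is roughly the following (the SURT urlkey form is assumed from pywb's conventions; values invented):

    # urlkey = 'com,example)/page'
    # ts     = '20200102030405'
    # index  = {'url': 'https://example.com/page', 'mime': 'text/html', ...}
    #
    # yielding a CDXJ line like:
    # com,example)/page 20200102030405 {"url": "https://example.com/page", ...}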
Example #9
    def _write_line(
        self, out, index, record, filename
    ):  # Possibility 1: add custom stuff here through the record, raw stream
        url = index.get('url')
        if not url:
            url = record.rec_headers.get('WARC-Target-URI')

        dt = record.rec_headers.get('WARC-Date')

        ts = iso_date_to_timestamp(dt)

        if hasattr(record, 'urlkey'):
            urlkey = record.urlkey
        else:
            urlkey = self.get_url_key(url)

        self._do_write(urlkey, ts, index, out)
Example #10
    def extract_text(self, record):
        url = record.rec_headers.get('WARC-Target-URI')
        date = record.rec_headers.get('WARC-Date')
        ts = iso_date_to_timestamp(date)
        id_ = ts + '/' + url

        if self.main_url and url == self.main_url:
            print('Found Main Url: {0}'.format(url))
            self.pages[id_] = {'timestamp': ts, 'url': url, 'title': url}

        mime = self.get_record_mime_type(record)

        if mime not in HTML_MIME_TYPES:
            return

        # skip redirects; guard against records without http headers
        if record.http_headers and record.http_headers.get_statuscode().startswith('3'):
            return

        if id_ not in self.pages:
            if self.detect_pages:
                self.pages[id_] = {'timestamp': ts, 'url': url, 'title': url}
            else:
                return

        content = self._read_record(record)
        if not content:
            return

        try:
            extractor = extractors.ArticleExtractor()

            content = content.decode("utf-8")

            doc = extractor.get_doc(content)

            if doc.content:
                self.pages[id_]["text"] = doc.content

            if doc.title:
                self.pages[id_]["title"] = doc.title

        except Exception as e:
            print(e)
            # skip text extraction in case of errors
            pass
Example #11
    def create_wr_metadata(self, log, rec_title):
        pagelist = []

        for page in log['pages']:
            if not page['title'].startswith(('http:', 'https:')):
                continue

            pagelist.append(dict(title=page['title'],
                                 url=page['title'],
                                 timestamp=iso_date_to_timestamp(page['startedDateTime'])))

        metadata = {"title": rec_title,
                    "type": "recording",
                   }

        if pagelist:
            metadata["pages"] = pagelist

        return metadata
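The log argument is HAR-shaped: each page entry carries a startedDateTime, and in this workflow the page URL is stored in its 'title' field. A sketch (values invented):

    # Hypothetical HAR-style input (values invented).
    log = {'pages': [
        {'title': 'https://example.com/',
         'startedDateTime': '2020-01-02T03:04:05Z'},
        {'title': 'Not a URL',
         'startedDateTime': '2020-01-02T03:04:06Z'},  # skipped
    ]}
    # create_wr_metadata(log, 'My Recording') ->
    # {'title': 'My Recording', 'type': 'recording',
    #  'pages': [{'title': 'https://example.com/',
    #             'url': 'https://example.com/',
    #             'timestamp': '20200102030405'}]}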
Example #12
    def _write_line(self, out, index, record, filename):
        url = index.get('url')
        if not url:
            url = record.rec_headers.get('WARC-Target-URI')

        dt = record.rec_headers.get('WARC-Date')

        ts = iso_date_to_timestamp(dt)

        # TODO: do my own index search here
        if self.current_raw_body != '':
            index['javascript'] = custom_index.javascript_index(
                self.current_raw_body)

        if hasattr(record, 'urlkey'):
            urlkey = record.urlkey
        else:
            urlkey = self.get_url_key(url)

        self._do_write(urlkey, ts, index, out)
Example #13
    def check_pages_and_text(self, record):
        url = record.rec_headers.get("WARC-Target-URI")
        date = record.rec_headers.get("WARC-Date")
        ts = iso_date_to_timestamp(date)
        id_ = ts + "/" + url
        matched_id = ""
        # Check for both a matching url/ts and url entry
        if id_ in self.passed_pages_dict:
            matched_id = id_
        if url in self.passed_pages_dict:
            matched_id = url
        # If we find a match build a record
        if matched_id != "":
            self.pages[matched_id] = {
                "timestamp": ts,
                "url": url,
                "title": url
            }
            # Add title and text if they've been provided
            if "title" in self.passed_pages_dict[matched_id]:
                self.pages[matched_id]["title"] = self.passed_pages_dict[
                    matched_id]["title"]
            if "text" in self.passed_pages_dict[matched_id]:
                self.pages[matched_id]["text"] = self.passed_pages_dict[
                    matched_id]["text"]
            # Delete the entry from our pages_dict so we can't match it again
            del self.passed_pages_dict[matched_id]

        if (self.main_url and self.main_url == url and self.main_ts
                and self.main_ts == ts):
            self.main_ts_flag = True
            self.main_url_flag = True
            print("Found Main Url: {0}".format(url))
            print("Found Main ts: {0}".format(ts))
            # If we're not relying on passed-in pages, add all records to the self.pages object
            if self.passed_pages_dict == {}:
                self.pages[id_] = {"timestamp": ts, "url": url, "title": url}
        if self.main_url and self.main_url == url and self.main_ts is None:
            self.main_url_flag = True
            print("Found Main Url: {0}".format(url))
            if id_ not in self.pages:
                self.pages[id_] = {"timestamp": ts, "url": url, "title": url}

        mime = self.get_record_mime_type(record)

        if mime not in HTML_MIME_TYPES:
            return

        # skip redirects; guard against records without http headers
        if record.http_headers and record.http_headers.get_statuscode().startswith("3"):
            return

        if id_ not in self.pages:
            if self.detect_pages:
                self.pages[id_] = {"timestamp": ts, "url": url, "title": url}
            else:
                return

        # if not extracting text, then finish here
        if not self.extract_text:
            return

        content = self._read_record(record)
        if not content:
            return

        try:
            extractor = extractors.ArticleExtractor()

            content = content.decode("utf-8")

            doc = extractor.get_doc(content)

            curr_page = self.pages[id_]

            if doc.content:
                self.pages[id_]["text"] = doc.content
                self.has_text = True

            # only set title if unset, or set to url (default)
            # avoid overriding user-specified title, if any
            if doc.title and self.pages[id_].get("title", url) == url:
                self.pages[id_]["title"] = doc.title

        except Exception as e:
            # skip text extraction in case of errors
            print("Skipping, Text Extraction Failed For: " + url)
            print(e)
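Each entry accumulated in self.pages is what later becomes a line in a WACZ pages.jsonl file (an assumption about the surrounding pipeline). A sketch of one entry after a successful extraction (values invented):

    # self.pages keyed by '<timestamp>/<url>':
    # {'20200102030405/https://example.com/': {
    #      'timestamp': '20200102030405',
    #      'url': 'https://example.com/',
    #      'title': 'Example Domain',            # doc.title, unless user-set
    #      'text': 'Extracted article text...',  # doc.content
    # }}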
Example #14
    def init_proxy(self, config):
        """Initialize and start proxy mode. If proxy configuration entry is not contained in the config
        this is a no op. Causes handler to become an instance of WSGIProxMiddleware.

        :param dict config: The configuration object used to configure this instance of FrontEndApp
        """
        proxy_config = config.get('proxy')
        if not proxy_config:
            return

        if isinstance(proxy_config, str):
            proxy_coll = proxy_config
            proxy_config = {}
        else:
            proxy_coll = proxy_config['coll']

        if '/' in proxy_coll:
            raise Exception('Proxy collection can not contain "/"')

        proxy_config['ca_name'] = proxy_config.get('ca_name',
                                                   self.PROXY_CA_NAME)
        proxy_config['ca_file_cache'] = proxy_config.get(
            'ca_file_cache', self.PROXY_CA_PATH)

        if proxy_config.get('recording'):
            logging.info(
                'Proxy recording into collection "{0}"'.format(proxy_coll))
            if proxy_coll in self.warcserver.list_fixed_routes():
                raise Exception('Can not record into fixed collection')

            proxy_route = proxy_coll + self.RECORD_ROUTE
            if not config.get('recorder'):
                config['recorder'] = 'live'

            self.proxy_record = True

        else:
            logging.info(
                'Proxy enabled for collection "{0}"'.format(proxy_coll))
            self.proxy_record = False
            proxy_route = proxy_coll

        if proxy_config.get('enable_content_rewrite', True):
            self.proxy_prefix = '/{0}/bn_/'.format(proxy_route)
        else:
            self.proxy_prefix = '/{0}/id_/'.format(proxy_route)

        self.proxy_default_timestamp = proxy_config.get('default_timestamp')
        if self.proxy_default_timestamp:
            if not self.ALL_DIGITS.match(self.proxy_default_timestamp):
                try:
                    self.proxy_default_timestamp = iso_date_to_timestamp(
                        self.proxy_default_timestamp)
                except Exception:
                    raise Exception(
                        'Invalid Proxy Timestamp: Must Be All-Digit Timestamp or ISO Date Format'
                    )

        self.proxy_coll = proxy_coll

        self.handler = WSGIProxMiddleware(self.handle_request,
                                          self.proxy_route_request,
                                          proxy_host=proxy_config.get(
                                              'host', 'pywb.proxy'),
                                          proxy_options=proxy_config)
Example #15
    def write_snapshot(self,
                       user,
                       coll,
                       url,
                       title,
                       html_text,
                       referrer,
                       user_agent,
                       browser=None):

        snap_title = 'Static Snapshots'

        snap_rec = self.sanitize_title(snap_title)

        if not self.manager.has_recording(user, coll, snap_rec):
            recording = self.manager.create_recording(user, coll, snap_rec,
                                                      snap_title)

        kwargs = dict(user=user,
                      coll=quote(coll),
                      rec=quote(snap_rec, safe='/*'),
                      type='snapshot')

        params = {'url': url}

        upstream_url = self.manager.content_app.get_upstream_url(
            '', kwargs, params)

        headers = {
            'Content-Type': 'text/html; charset=utf-8',
            'WARC-User-Agent': user_agent,
            'WARC-Referer': referrer,
        }

        r = requests.put(
            upstream_url,
            data=BytesIO(html_text.encode('utf-8')),
            headers=headers,
        )

        try:
            res = r.json()
            if res['success'] != 'true':
                print(res)
                return {'error_message': 'Snapshot Failed'}

            warc_date = res.get('WARC-Date')

        except Exception as e:
            print(e)
            return {'error_message': 'Snapshot Failed'}

        if not title:
            return {'snapshot': ''}

        if warc_date:
            timestamp = iso_date_to_timestamp(warc_date)
        else:
            timestamp = timestamp_now()

        page_data = {
            'url': url,
            'title': title,
            'timestamp': timestamp,
            'tags': ['snapshot'],
        }
        if browser:
            page_data['browser'] = browser

        res = self.manager.add_page(user, coll, snap_rec, page_data)

        return {'snapshot': page_data}
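If the upstream recorder echoes a WARC-Date, the page timestamp is derived from it; otherwise timestamp_now() is used. A sketch of the resulting page_data (values invented):

    # With WARC-Date '2020-01-02T03:04:05Z' echoed back:
    page_data = {
        'url': 'https://example.com/',
        'title': 'Example Domain',
        'timestamp': '20200102030405',  # iso_date_to_timestamp(warc_date)
        'tags': ['snapshot'],
    }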
Example #16
    def check_pages_and_text(self, record):
        url = record.rec_headers.get("WARC-Target-URI")
        date = record.rec_headers.get("WARC-Date")
        ts = iso_date_to_timestamp(date)
        id_ = ts + "/" + url

        if (
            self.main_url
            and self.main_url == url
            and self.main_ts
            and self.main_ts == ts
        ):
            self.main_ts_flag = True
            self.main_url_flag = True
            print("Found Main Url: {0}".format(url))
            print("Found Main ts: {0}".format(ts))
            self.pages[id_] = {"timestamp": ts, "url": url, "title": url}
        if self.main_url and self.main_url == url and self.main_ts is None:
            self.main_url_flag = True
            print("Found Main Url: {0}".format(url))
            self.pages[id_] = {"timestamp": ts, "url": url, "title": url}

        mime = self.get_record_mime_type(record)

        if mime not in HTML_MIME_TYPES:
            return

        # skip redirects; guard against records without http headers
        if record.http_headers and record.http_headers.get_statuscode().startswith("3"):
            return

        if id_ not in self.pages:
            if self.detect_pages:
                self.pages[id_] = {"timestamp": ts, "url": url, "title": url}
            else:
                return

        # if not extracting text, then finish here
        if not self.extract_text:
            return

        content = self._read_record(record)
        if not content:
            return

        try:
            extractor = extractors.ArticleExtractor()

            content = content.decode("utf-8")

            doc = extractor.get_doc(content)

            curr_page = self.pages[id_]

            if doc.content:
                self.pages[id_]["text"] = doc.content
                self.has_text = True

            # only set title if unset, or set to url (default)
            # avoid overriding user-specified title, if any
            if doc.title and self.pages[id_].get("title", url) == url:
                self.pages[id_]["title"] = doc.title

        except Exception as e:
            # skip text extraction in case of errors
            print("Skipping, Text Extraction Failed For: " + url)
            print(e)
Example #17
    def write_snapshot(self, user, collection, url, title, html_text, referrer,
                       user_agent, browser=None):

        snap_title = 'Static Snapshots'

        snap_rec_name = self.sanitize_title(snap_title)

        recording = collection.get_recording_by_name(snap_rec_name)
        if not recording:
            recording = collection.create_recording(snap_rec_name,
                                                    title=snap_rec_name)

        kwargs = dict(user=user.name,
                      coll=collection.my_id,
                      rec=quote(snap_rec_name, safe='/*'),
                      type='put_record')

        params = {'url': url}

        upstream_url = self.content_app.get_upstream_url('', kwargs, params)

        headers = {'Content-Type': 'text/html; charset=utf-8',
                   'WARC-User-Agent': user_agent,
                   'WARC-Referer': referrer,
                  }

        r = requests.put(upstream_url,
                         data=BytesIO(html_text.encode('utf-8')),
                         headers=headers,
                        )

        try:
            res = r.json()
            if res['success'] != 'true':
                print(res)
                return {'error_message': 'Snapshot Failed'}

            warc_date = res.get('WARC-Date')

        except Exception as e:
            print(e)
            return {'error_message': 'Snapshot Failed'}


        if not title:
            return {'snapshot': ''}

        if warc_date:
            timestamp = iso_date_to_timestamp(warc_date)
        else:
            timestamp = timestamp_now()


        page_data = {'url': url,
                     'title': title,
                     'timestamp': timestamp,
                     'tags': ['snapshot'],
                    }
        if browser:
            page_data['browser'] = browser

        res = recording.add_page(page_data)

        return {'snapshot': page_data}