def load_archive_info_xml(self, url):
    """Build ``self.archive_infos`` from an XML description of archives.

    Every ``<link>`` element that carries both an ``<archive>`` and a
    ``<timegate>`` child becomes one entry, keyed by the link's ``id``
    attribute.  Any failure to fetch the document is logged and treated
    as "no archive info" (best effort -- the caller proceeds without it).
    """
    self.archive_infos = {}
    logging.debug('Loading XML from {0}'.format(url))

    if not url:
        return

    try:
        stream = BlockLoader().load(url)
    except Exception as e:
        logging.debug(e)
        logging.debug('Proceeding without xml archive info')
        return

    root = ElementTree.fromstring(stream.read())

    for entry in root.findall('link'):
        arc_id = entry.get('id')
        archive_elem = entry.find('archive')
        timegate_elem = entry.find('timegate')

        # Entries missing either child element are unusable -- skip them.
        if archive_elem is None or timegate_elem is None:
            continue

        self.archive_infos[arc_id] = {
            'uri': timegate_elem.get('uri'),
            'rewritten': archive_elem.get('rewritten-urls') == 'yes',
            'unrewritten_url': archive_elem.get('un-rewritten-api-url', ''),
        }
def load_archive_info_json(self, url):
    """Populate ``self.archive_infos`` from a JSON list of archive records.

    Each record must provide ``id``, ``name`` and ``timegate``.
    ``base_url`` defaults to the timegate URI, and ``unrewritten_url``
    is derived from ``base_url`` when absent.  A failed load is logged
    and skipped so the caller proceeds without archive info (best effort).
    """
    self.archive_infos = {}
    url = os.path.expandvars(url)
    # BUG FIX: the message previously said 'Loading XML' in the JSON loader.
    logging.debug('Loading JSON from {0}'.format(url))

    if not url:
        return

    try:
        stream = BlockLoader().load(url)
    except Exception as e:
        logging.debug(e)
        logging.debug('Proceeding without json archive info')
        return

    archives = json.loads(stream.read())

    for arc in archives:
        id_ = arc['id']
        name = arc['name']
        uri = arc['timegate']
        base_url = arc.get('base_url', uri)

        unrewritten_url = arc.get('unrewritten_url')
        if not unrewritten_url:
            # Default raw-capture endpoint pattern, relative to base_url.
            unrewritten_url = base_url + '{timestamp}id_/{url}'

        self.archive_infos[id_] = {
            'id': id_,
            'uri': uri,
            'name': name,
            'base_url': base_url,
            'unrewritten_url': unrewritten_url,
        }
def load_archive_info_xml(self, url):
    """Load archive descriptions from an XML document into ``self.archive_infos``.

    A ``<link>`` element contributes one entry, keyed by its ``id``
    attribute, only when both an ``<archive>`` and a ``<timegate>``
    child are present.  The link's ``longname`` is stored as ``name``.
    Fetch failures are logged and the method returns with an empty map.
    """
    self.archive_infos = {}
    url = os.path.expandvars(url)
    logging.debug('Loading XML from {0}'.format(url))

    if not url:
        return

    try:
        stream = BlockLoader().load(url)
    except Exception as e:
        logging.debug(e)
        logging.debug('Proceeding without xml archive info')
        return

    root = ElementTree.fromstring(stream.read())

    for entry in root.findall('link'):
        arc_id = entry.get('id')
        display_name = entry.get('longname')
        archive_elem = entry.find('archive')
        timegate_elem = entry.find('timegate')

        # Both child elements are required for a usable entry.
        if archive_elem is None or timegate_elem is None:
            continue

        self.archive_infos[arc_id] = {
            'uri': timegate_elem.get('uri'),
            'rewritten': archive_elem.get('rewritten-urls') == 'yes',
            'unrewritten_url': archive_elem.get('un-rewritten-api-url', ''),
            'name': display_name,
        }
def load_archive_info_json(self, url):
    """Populate ``self.archive_infos`` from a JSON list of archive records.

    Each record must provide ``id``, ``name`` and ``timegate``; entries
    are stored with ``rewritten`` forced to ``True`` and an
    ``unrewritten_url`` derived from the timegate URI when not supplied.
    A failed load is logged and skipped so the caller proceeds without
    archive info (best effort).
    """
    self.archive_infos = {}
    url = os.path.expandvars(url)
    # BUG FIX: the message previously said 'Loading XML' in the JSON loader.
    logging.debug('Loading JSON from {0}'.format(url))

    if not url:
        return

    try:
        stream = BlockLoader().load(url)
    except Exception as e:
        logging.debug(e)
        logging.debug('Proceeding without json archive info')
        return

    archives = json.loads(stream.read())

    for arc in archives:
        id_ = arc['id']
        name = arc['name']
        uri = arc['timegate']

        unrewritten_url = arc.get('unrewritten_url')
        if not unrewritten_url:
            # Default raw-capture endpoint pattern, relative to the timegate.
            unrewritten_url = uri + '{timestamp}id_/{url}'

        self.archive_infos[id_] = {
            'id': id_,
            'uri': uri,
            'name': name,
            'rewritten': True,
            'unrewritten_url': unrewritten_url,
        }
def test_s3_read_2():
    """Fetch a small HTML object from the public commoncrawl S3 bucket
    and verify its size and first decompressed line.  Skipped when the
    boto3 package is unavailable."""
    pytest.importorskip('boto3')

    result = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/index.html')
    payload = result.read()
    assert len(payload) == 2082

    buffered = DecompressingBufferedReader(BytesIO(payload))
    assert buffered.readline() == b'<!DOCTYPE html>\n'
def test_s3_read_1():
    """Read a 2526-byte range of a WARC file from the public commoncrawl
    S3 bucket and verify the decompressed record header.  Skipped when
    the boto package is unavailable."""
    pytest.importorskip('boto')

    result = BlockLoader().load(
        's3://commoncrawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
        offset=53235662,
        length=2526)

    payload = result.read()
    assert len(payload) == 2526

    buffered = DecompressingBufferedReader(BytesIO(payload))
    assert buffered.readline() == b'WARC/1.0\r\n'
    assert buffered.readline() == b'WARC-Type: response\r\n'
def test_s3_read_1():
    """Read a 2526-byte range of a WARC file from the aws-publicdatasets
    S3 bucket and verify the decompressed record header.  Skipped when
    the boto package is unavailable."""
    pytest.importorskip('boto')

    result = BlockLoader().load(
        's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
        offset=53235662,
        length=2526)

    payload = result.read()
    assert len(payload) == 2526

    buffered = DecompressingBufferedReader(BytesIO(payload))
    assert buffered.readline() == b'WARC/1.0\r\n'
    assert buffered.readline() == b'WARC-Type: response\r\n'