def test_iterator(self):
    """Test iterator semantics on a 3 record WARC.

    Alternates between ``for`` loops (exited early with ``break``) and
    direct ``next()`` calls on the same ``ArchiveIterator`` to check that
    iteration resumes where it left off, then checks that the iterator
    raises ``StopIteration`` once the three records are consumed.
    """
    with open(get_test_file('example-iana.org-chunked.warc'), 'rb') as fh:
        a = ArchiveIterator(fh)

        # First record, taken through the for-loop protocol.
        for record in a:
            assert record.rec_type == 'warcinfo'
            break

        # Second record, taken with an explicit next().
        record = next(a)
        assert record.rec_type == 'response'

        # Third record, back through a for loop on the same iterator.
        for record in a:
            assert record.rec_type == 'request'
            break

        # Nothing left: a further next() must raise StopIteration.
        with pytest.raises(StopIteration):
            record = next(a)

    # Identity comparison: a record type overriding __eq__ cannot make
    # these checks pass by accident.
    assert a.record is None
    assert a.read_to_end() is None
def parse_uploaded(self, stream, expected_size):
    """Walk an uploaded WARC stream and collect recording index entries.

    :param stream: file object the archive is read from
    :param int expected_size: expected WARC archive size
    :returns: list of recordings (indices), filled via ``add_index_info``
    :rtype: list
    """
    archive_iter = ArchiveIterator(
        stream,
        no_record_parse=True,
        verify_http=True,
        block_size=BLOCK_SIZE,
    )

    recordings = []
    current_info = None   # index entry currently being accumulated
    pending_info = None   # entry whose 'offset' is filled after its record
    remote_set = None     # 'ra' set of the current metadata-backed entry
    first_record = True

    for record in archive_iter:
        warcinfo = None

        if record.rec_type == 'warcinfo':
            try:
                warcinfo = self.parse_warcinfo(record)
            except Exception:
                # Best effort: report the failure and treat the record as
                # if it carried no usable warcinfo.
                print('Error Parsing WARCINFO')
                traceback.print_exc()
        elif remote_set is not None:
            source_uri = record.rec_headers.get('WARC-Source-URI')
            if source_uri and self.wam_loader:
                match = self.wam_loader.find_archive_for_url(source_uri)
                if match:
                    remote_set.add(match[2])

        archive_iter.read_to_end(record)

        # Offset of an entry opened by the previous warcinfo is only taken
        # from member_info after the following record has been read.
        if pending_info:
            pending_info['offset'] = archive_iter.member_info[0]
            pending_info = None

        if warcinfo and 'json-metadata' in warcinfo:
            # Hand over the previous entry, then start a new one from the
            # embedded metadata.
            self.add_index_info(recordings, current_info,
                                archive_iter.member_info[0])

            current_info = warcinfo.get('json-metadata')
            current_info['offset'] = None
            current_info.setdefault('title', 'Uploaded Recording')
            current_info.setdefault('type', 'recording')

            remote_set = set()
            current_info['ra'] = remote_set

            pending_info = current_info
        elif first_record:
            # No metadata on the very first record: use a default entry.
            current_info = {
                'type': 'recording',
                'title': 'Uploaded Recording',
                'offset': 0,
            }

        if first_record and warcinfo and 'software' in warcinfo:
            current_info['warcinfo:software'] = warcinfo['software']
            current_info['warcinfo:datetime'] = record.rec_headers.get(
                'WARC-Date')

        first_record = False

    if current_info:
        self.add_index_info(recordings, current_info, stream.tell())

    # if anything left over, likely due to WARC error, consume remainder
    if stream.tell() < expected_size:
        while stream.read(8192):
            pass

    return recordings
def parse_uploaded(self, stream, expected_size):
    """Parse an uploaded WARC archive into a list of index entries.

    Iterates the records of ``stream`` with ``ArchiveIterator``. Each
    warcinfo record that carries ``'json-metadata'`` starts a new index
    entry; if the first record does not, a default 'Uploaded Recording'
    entry with offset 0 is used. Entries are handed to
    ``self.add_index_info`` together with ``infos``.

    :param stream: file object the archive is read from
    :param int expected_size: expected WARC archive size
    :returns: the ``infos`` list passed to ``add_index_info``
    :rtype: list
    """
    arciterator = ArchiveIterator(stream,
                                  no_record_parse=True,
                                  verify_http=True,
                                  block_size=BLOCK_SIZE)
    infos = []

    last_indexinfo = None
    indexinfo = None
    is_first = True

    for record in arciterator:
        warcinfo = None
        if record.rec_type == 'warcinfo':
            try:
                warcinfo = self.parse_warcinfo(record)
            except Exception:
                # Best effort: report and continue as if no warcinfo.
                print('Error Parsing WARCINFO')
                traceback.print_exc()

        arciterator.read_to_end(record)

        # The entry opened by the previous warcinfo gets its offset from
        # member_info once the following record has been read.
        if last_indexinfo:
            last_indexinfo['offset'] = arciterator.member_info[0]
            last_indexinfo = None

        # Require the 'json-metadata' key explicitly: a warcinfo without it
        # would otherwise yield indexinfo = None and raise TypeError on the
        # item assignment below.
        if warcinfo and 'json-metadata' in warcinfo:
            self.add_index_info(infos, indexinfo, arciterator.member_info[0])

            indexinfo = warcinfo.get('json-metadata')
            indexinfo['offset'] = None

            if 'title' not in indexinfo:
                indexinfo['title'] = 'Uploaded Recording'

            if 'type' not in indexinfo:
                indexinfo['type'] = 'recording'

            last_indexinfo = indexinfo

        elif is_first:
            # No usable metadata on the first record: default entry.
            indexinfo = {
                'type': 'recording',
                'title': 'Uploaded Recording',
                'offset': 0,
            }

        is_first = False

    if indexinfo:
        self.add_index_info(infos, indexinfo, stream.tell())

    # if anything left over, likely due to WARC error, consume remainder
    if stream.tell() < expected_size:
        while stream.read(8192):
            pass

    return infos
def parse_uploaded(self, stream, expected_size):
    """Walk an uploaded WARC stream and collect recording index entries.

    :param stream: file object the archive is read from
    :param int expected_size: expected WARC archive size
    :returns: list of recordings (indices), filled via ``add_index_info``
    :rtype: list
    """
    archive_iter = ArchiveIterator(
        stream,
        no_record_parse=True,
        verify_http=True,
        block_size=BLOCK_SIZE,
    )

    recordings = []
    current_info = None   # index entry currently being accumulated
    pending_info = None   # entry whose 'offset' is filled after its record
    remote_set = None     # 'ra' set of the current metadata-backed entry
    first_record = True

    for record in archive_iter:
        warcinfo = None

        if record.rec_type == 'warcinfo':
            try:
                warcinfo = self.parse_warcinfo(record)
            except Exception:
                # Best effort: report the failure and treat the record as
                # if it carried no usable warcinfo.
                print('Error Parsing WARCINFO')
                traceback.print_exc()
        elif remote_set is not None:
            source_uri = record.rec_headers.get('WARC-Source-URI')
            if source_uri and self.wam_loader:
                match = self.wam_loader.find_archive_for_url(source_uri)
                if match:
                    remote_set.add(match[2])

        archive_iter.read_to_end(record)

        # Offset of an entry opened by the previous warcinfo is only taken
        # from member_info after the following record has been read.
        if pending_info:
            pending_info['offset'] = archive_iter.member_info[0]
            pending_info = None

        if warcinfo and 'json-metadata' in warcinfo:
            # Hand over the previous entry, then start a new one from the
            # embedded metadata.
            self.add_index_info(recordings, current_info,
                                archive_iter.member_info[0])

            current_info = warcinfo.get('json-metadata')
            current_info['offset'] = None
            current_info.setdefault('title', 'Uploaded Recording')
            current_info.setdefault('type', 'recording')

            remote_set = set()
            current_info['ra'] = remote_set

            pending_info = current_info
        elif first_record:
            # No metadata on the very first record: use a default entry.
            current_info = {
                'type': 'recording',
                'title': 'Uploaded Recording',
                'offset': 0,
            }

        if first_record and warcinfo and 'software' in warcinfo:
            current_info['warcinfo:software'] = warcinfo['software']
            current_info['warcinfo:datetime'] = record.rec_headers.get(
                'WARC-Date')

        first_record = False

    if current_info:
        self.add_index_info(recordings, current_info, stream.tell())

    # if anything left over, likely due to WARC error, consume remainder
    if stream.tell() < expected_size:
        while stream.read(8192):
            pass

    return recordings