class TestWARC:
    """Tests for ``WARC`` run against a scratch copy of ``example.warc.gz``."""

    def setUp(self):
        """Copy the example archive into a fresh temp directory and open it."""
        self.tempdir = tempfile.mkdtemp()
        # The fixture is looked up in the directory containing the ``index`` module.
        package_dir = os.path.dirname(os.path.abspath(index.__file__))
        fixture = os.path.join(package_dir, 'example.warc.gz')
        scratch_copy = os.path.join(self.tempdir, 'example.warc.gz')
        shutil.copy(fixture, scratch_copy)
        self.w = WARC(scratch_copy)

    def tearDown(self):
        """Remove the temp directory created by ``setUp``."""
        shutil.rmtree(self.tempdir)

    def test_iter(self):
        """Iterating the example archive yields exactly ten records."""
        records = list(self.w)
        assert_equals(10, len(records))

    def test_no_repeats(self):
        """Re-adding the fixture's own records must not duplicate any record ID."""
        package_dir = os.path.dirname(os.path.abspath(index.__file__))
        fixture = os.path.join(package_dir, 'example.warc.gz')
        # Feed every record of the pristine fixture back into the scratch copy.
        for record in WARC(fixture):
            self.w.add(record)

        seen_ids = set()
        for record in self.w:
            record_id = record.headers['WARC-Record-ID']
            assert_not_in(record_id, seen_ids)
            seen_ids.add(record_id)
def process_warcs(h, name, to_dir):
    """Sort the records of job *name* into one WARC file per day under *to_dir*.

    The job's source directory is the directory containing the file named by
    ``h.get_job_info(name)['job']['primaryConfig']``. Each record returned by
    ``filter_records`` for that directory is added to
    ``<to_dir>/<date>.warc``, where ``<date>`` is the part of the record's
    ``WARC-Date`` header before the ``'T'`` separator.

    Args:
        h: Object exposing ``get_job_info(name)``; presumably the Heritrix
            client used elsewhere in this module -- TODO confirm.
        name: Job name; also used to name the ``archive-<name>`` logger.
        to_dir: Directory that receives the per-day ``.warc`` files.

    Returns:
        None. A ``KeyError`` is never propagated: a missing ``primaryConfig``
        and a ``KeyError`` raised while handling records are both logged, and
        processing stops at that point.
    """
    logger = logging.getLogger('archive-%s' % name)
    logger.info('Processing WARCs into days')

    # Fetch the job description once so the error path logs exactly the data
    # that failed the lookup instead of issuing a second request.
    job_info = h.get_job_info(name)
    try:
        from_dir = os.path.dirname(job_info['job']['primaryConfig'])
    except KeyError:
        logger.error('Couldn\'t find primaryConfig')
        logger.error(str(job_info))
        return

    # Handled separately from the config lookup so that a KeyError here (for
    # example a record without a WARC-Date header) is not misreported as a
    # missing primaryConfig.
    try:
        for r in filter_records(from_dir):
            # Two-way unpacking is kept on purpose: a timestamp without 'T'
            # still raises ValueError rather than silently becoming a filename.
            date, _ = r.headers['WARC-Date'].split('T', 1)
            w = WARC(os.path.join(to_dir, date + ".warc"), order_by='WARC-Date')
            w.add(r)
            w.save()
    except KeyError:
        logger.exception('KeyError while sorting WARC records into days')
import os

import archive
from github.WARC import WARC

# One-off migration script: walk every job directory under ``jobs_dir`` and
# sort its records into one WARC file per day under ``warcs_dir``.
jobs_dir = '/cs/research/fmedia/data5/wmayor/github/heritrix-3.1.1/jobs'
warcs_dir = '/cs/research/fmedia/data5/wmayor/github/warcs'

for entry in os.listdir(jobs_dir):
    job_path = os.path.join(jobs_dir, entry)
    if not os.path.isdir(job_path):
        # Only directories are treated as jobs; stray files are ignored.
        continue
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(job_path)
    for r in archive.filter_records(job_path):
        # The day is the part of WARC-Date before 'T'; the time part is unused.
        date, _ = r.headers['WARC-Date'].split('T', 1)
        w = WARC(os.path.join(warcs_dir, date + ".warc"), order_by='WARC-Date')
        w.add(r)
        w.save()

print('Done!')