Пример #1
0
class TestWARC:

    def setUp(self):
        self.tempdir = tempfile.mkdtemp()
        d = os.path.dirname(os.path.abspath(index.__file__))
        p = os.path.join(d, 'example.warc.gz')
        q = os.path.join(self.tempdir, 'example.warc.gz')
        shutil.copy(p, q)
        self.w = WARC(q)


    def tearDown(self):
        shutil.rmtree(self.tempdir)

    def test_iter(self):
        assert_equals(10, len(list(self.w)))

    def test_no_repeats(self):
        d = os.path.dirname(os.path.abspath(index.__file__))
        p = os.path.join(d, 'example.warc.gz')
        for r in WARC(p):
            self.w.add(r)
        s = set()
        for r in self.w:
            assert_not_in(r.headers['WARC-Record-ID'], s)
            s.add(r.headers['WARC-Record-ID'])
Пример #2
0
def process_warcs(h, name, to_dir):
    """Split the records of job ``name`` into one WARC file per day.

    The job's source directory is the directory containing the path stored
    under ``['job']['primaryConfig']`` in ``h.get_job_info(name)``. Every
    record yielded by ``filter_records`` for that directory is added to
    ``<to_dir>/<date>.warc``, where ``<date>`` is the part of the record's
    ``WARC-Date`` header before the ``'T'``, and the file is saved after
    each record.

    Args:
        h: Object exposing ``get_job_info(name)`` (presumably a Heritrix API
            client -- confirm against the caller).
        name: Job name; also used as the suffix of the logger name.
        to_dir: Directory that receives the per-day ``.warc`` files.

    Returns:
        None. If the job info has no ``primaryConfig`` entry, the problem and
        the job info are logged as errors and nothing is processed.

    Note:
        Only the ``primaryConfig`` lookup is guarded. A ``KeyError`` raised
        while handling records (for example a missing ``WARC-Date`` header)
        propagates to the caller instead of being reported as a missing
        ``primaryConfig``.
    """
    logger = logging.getLogger('archive-%s' % name)
    logger.info('Processing WARCs into days')

    # Fetch once so the error log shows exactly the data that failed.
    job_info = h.get_job_info(name)
    try:
        from_dir = os.path.dirname(job_info['job']['primaryConfig'])
    except KeyError:
        logger.error('Couldn\'t find primaryConfig')
        logger.error(str(job_info))
        return

    for r in filter_records(from_dir):
        # Two-target unpacking keeps the original strictness: a WARC-Date
        # without a 'T' separator still raises ValueError.
        date, _ = r.headers['WARC-Date'].split('T', 1)
        w = WARC(os.path.join(to_dir, date + ".warc"), order_by='WARC-Date')
        w.add(r)
        w.save()
Пример #3
0
 def setUp(self):
     """Copy the sample archive into a fresh temp directory and open it.

     The temp directory is stored on ``self.tempdir`` and the opened
     archive on ``self.w``.
     """
     self.tempdir = tempfile.mkdtemp()
     sample_dir = os.path.dirname(os.path.abspath(index.__file__))
     sample_path = os.path.join(sample_dir, 'example.warc.gz')
     scratch_path = os.path.join(self.tempdir, 'example.warc.gz')
     shutil.copy(sample_path, scratch_path)
     self.w = WARC(scratch_path)
Пример #4
0
import os

import archive
from github.WARC import WARC

# Input: directory whose sub-directories are scanned one by one.
jobs_dir = '/cs/research/fmedia/data5/wmayor/github/heritrix-3.1.1/jobs'
# Output: directory that receives one "<date>.warc" file per day.
warcs_dir = '/cs/research/fmedia/data5/wmayor/github/warcs'

for entry in os.listdir(jobs_dir):
    job_path = os.path.join(jobs_dir, entry)
    if not os.path.isdir(job_path):
        # Plain files in the jobs directory are skipped.
        continue
    print(job_path)
    for record in archive.filter_records(job_path):
        # The part of WARC-Date before the 'T' names the output file.
        date, _ = record.headers['WARC-Date'].split('T', 1)
        day_path = os.path.join(warcs_dir, date + ".warc")
        day_warc = WARC(day_path, order_by='WARC-Date')
        day_warc.add(record)
        day_warc.save()

print('Done!')