def __init__(self, warc_source: str, warc_target: str=None):
    """Store the source/target paths and refuse to overwrite the target.

    :param warc_source: path of the WARC file to read.
    :param warc_target: path of the WARC file to write.
    :raises Exception: if a file already exists at the target path.
    """
    self.warc_source = warc_source
    self.warc_target = warc_target
    self._log = Log()
    self._response_records = {}
    self._log.log('Original WARC file is {}.'.format(self.warc_source))
    self._log.log('Deduplicated WARC file is {}.'.format(self.warc_target))
    # Guard clause: nothing more to do when the target path is free.
    if not os.path.isfile(self.warc_target):
        return
    already_exists = 'File {} already exists.'.format(self.warc_target)
    self._log.log(already_exists)
    raise Exception(already_exists)
class Warc:
    """Deduplicate the response records of a WARC file.

    Every record of the source file is copied to the target file. A
    ``response`` record whose (WARC-Payload-Digest, WARC-Target-URI) pair
    was already written earlier in the same file, or is reported by the CDX
    API, is written as a ``revisit`` record instead.
    """

    def __init__(self, warc_source: str, warc_target: str = None):
        """Prepare a deduplication run.

        :param warc_source: path of the WARC file to read.
        :param warc_target: path of the WARC file to write. ``None`` lets
            the ``warc_target`` setter derive it from ``warc_source``.
        :raises Exception: if the target file already exists.
        """
        # warc_source must be assigned first: the warc_target setter reads it.
        self.warc_source = warc_source
        self.warc_target = warc_target
        self._response_records = {}
        self._log = Log()
        self._log.log('Original WARC file is {}.'.format(self.warc_source))
        self._log.log('Deduplicated WARC file is {}.'.format(self.warc_target))
        if os.path.isfile(self.warc_target):
            self._log.log('File {} already exists.'.format(self.warc_target))
            raise Exception('File {} already exists.'.format(self.warc_target))
        self._session = requests.Session()

    def deduplicate(self):
        """Write the deduplicated copy of the source WARC to the target.

        Target URIs wrapped in ``<...>`` are unwrapped, ``warcinfo`` records
        get their ``WARC-Filename`` pointed at the target file, and the
        collected log is appended as a final record.
        """
        self._log.log('Start deduplication process.')
        with open(self.warc_source, 'rb') as s, \
                open(self.warc_target, 'wb') as t:
            writer = WARCWriter(filebuf=t,
                                gzip=self.warc_target.endswith('.gz'))
            for record in ArchiveIterator(s):
                headers = record.rec_headers
                url = headers.get_header('WARC-Target-URI')
                record_id = headers.get_header('WARC-Record-ID')
                self._log.log('Processing record {}.'.format(record_id))
                if url is not None and url.startswith('<'):
                    url = re.search('^<(.+)>$', url).group(1)
                    self._log.log('Replacing URL in record {} with {}.'.format(
                        record_id, url))
                    headers.replace_header('WARC-Target-URI', url)
                record_type = headers.get_header('WARC-Type')
                if record_type == 'response':
                    self._log.log('Deduplicating record {}.'.format(record_id))
                    data = self.get_duplicate(record)
                    if data:
                        self._log.log(
                            'Record {} is a duplicate from {}.'.format(
                                record_id, data))
                        writer.write_record(
                            self.response_to_revisit(writer, record, data))
                        continue
                    # False means a lookup failed, None means no duplicate.
                    if data is False:
                        self._log.log(
                            'Record {} could not be deduplicated.'.format(
                                record_id))
                    else:
                        self._log.log(
                            'Record {} is not a duplicate.'.format(record_id))
                    self.register_response(record)
                    writer.write_record(record)
                elif record_type == 'warcinfo':
                    self._log.set_warcinfo(
                        headers.get_header('WARC-Record-ID'))
                    headers.replace_header('WARC-Filename', self.warc_target)
                    writer.write_record(record)
                else:
                    writer.write_record(record)
            self._log.log('Writing log to WARC.')
            writer.write_record(self._log.create_record(writer))

    def register_response(self, record):
        """Remember *record* so later identical responses can refer to it."""
        headers = record.rec_headers
        target_uri = headers.get_header('WARC-Target-URI')
        key = (headers.get_header('WARC-Payload-Digest'), target_uri)
        self._response_records[key] = {
            'record-id': headers.get_header('WARC-Record-ID'),
            'date': headers.get_header('WARC-Date'),
            'target-uri': target_uri,
        }

    @staticmethod
    def response_to_revisit(writer, record, data):
        """Build a ``revisit`` record from *record* pointing at *data*.

        :param writer: writer whose ``create_warc_record`` builds the record.
        :param record: the duplicate ``response`` record; its WARC headers
            are modified in place.
        :param data: dict with ``date`` and ``target-uri`` and optionally
            ``record-id`` describing the earlier capture.
        :return: the record returned by ``writer.create_warc_record``.
        """
        warc_headers = record.rec_headers
        # Captures found through the CDX API carry no record id.
        if data.get('record-id') is not None:
            warc_headers.replace_header('WARC-Refers-To', data['record-id'])
        warc_headers.replace_header('WARC-Refers-To-Date', data['date'])
        warc_headers.replace_header('WARC-Refers-To-Target-URI',
                                    data['target-uri'])
        warc_headers.replace_header('WARC-Type', 'revisit')
        warc_headers.replace_header('WARC-Truncated', 'length')
        warc_headers.replace_header('WARC-Profile',
                                    'http://netpreserve.org/warc/1.0/'
                                    'revisit/identical-payload-digest')
        warc_headers.remove_header('WARC-Block-Digest')
        warc_headers.remove_header('Content-Length')
        return writer.create_warc_record(
            warc_headers.get_header('WARC-Target-URI'),
            'revisit',
            warc_headers=warc_headers,
            http_headers=record.http_headers)

    def get_duplicate(self, record):
        """Look up an earlier capture identical to *record*.

        Records registered from this file are consulted first, then the CDX
        API is queried with a ``to`` bound and, if that finds nothing, with
        a ``from`` bound.

        :return: a dict describing the earlier capture, ``False`` if no
            capture was found and at least one CDX lookup failed, or
            ``None`` if there is no duplicate.
        """
        key = (record.rec_headers.get_header('WARC-Payload-Digest'),
               record.rec_headers.get_header('WARC-Target-URI'))
        if key in self._response_records:
            return self._response_records[key]

        # The bounds are fixed values rather than derived from WARC-Date.
        # NOTE(review): the 'from' bound has 11 digits while the 'to' bound
        # has 12 - confirm this is intended.
        lookup_failed = False
        for bound, date in (('to', '201905310000'), ('from', '20190703000')):
            api_response = self.get_ia_duplicate(record,
                                                 range=bound,
                                                 date=date)
            if api_response:
                return api_response
            if api_response is False:
                lookup_failed = True
        # Report a failed lookup as False so the caller can log it as such
        # instead of claiming the record is not a duplicate.
        return False if lookup_failed else None

    def get_ia_duplicate(self, record, range, date):
        """Ask the CDX API for a capture with the same URL and digest.

        :param record: the ``response`` record to look up.
        :param range: name of the CDX date parameter, ``'to'`` or ``'from'``.
        :param date: numeric timestamp string; it is decremented by one
            before being sent.
        :return: a dict with ``target-uri`` and ``date`` for the first
            matching capture, ``None`` if the response body is empty, or
            ``False`` if the lookup failed or the response is unusable.
        """
        digest = record.rec_headers.get_header('WARC-Payload-Digest')
        uri = record.rec_headers.get_header('WARC-Target-URI')
        record_id = record.rec_headers.get_header('WARC-Record-ID')
        # Build the URL once so the logged and the requested URL cannot
        # drift apart. The raw literal keeps the backslash in 'warc\/revisit'.
        request_url = (
            'http://wwwb-dedup.us.archive.org:8083/cdx/search'
            '?url={}&limit=100&filter=digest:{}&fl=timestamp,original&{}={}'
            r'&filter=!mimetype:warc\/revisit'
        ).format(urllib.parse.quote(uri), digest.split(':')[1],
                 range, int(date) - 1)
        self._log.log('Requesting URL ' + request_url)
        success, response = get(
            request_url,
            sleep_time=1,
            max_tries=10,
            timeout=10,
            session=self._session)
        self._log.log('Received a response from URL {}.'.format(response.url))
        body = response.text
        if len(body.strip()) == 0:
            return None
        if 'org.archive.wayback.exception.RobotAccessControlException' in body:
            self._log.log(
                'Record {} is blocked by robots.txt.'.format(record_id))
            return False
        if ('org.archive.wayback.exception.'
                'AdministrativeAccessControlException') in body:
            self._log.log(
                'Record {} is excluded from the CDX API.'.format(record_id))
            return False
        if 'Requested Line is too large' in body:
            self._log.log('Record {} has a too large URL.'.format(record_id))
            return False
        if not success:
            self._log.log(
                'Record {} got a bad CDX API response.'.format(record_id))
            return False
        # Use the first line that looks like "<14-digit timestamp> <url>".
        line = next(
            (candidate for candidate in body.splitlines()
             if re.search(r'^[0-9]{14}\s+https?://', candidate)),
            None)
        if line is None:
            self._log.log(
                'Record {} for an invalid CDX API response'.format(record_id))
            return False
        # Split on any whitespace run, matching the \s+ accepted above.
        timestamp, original = line.strip().split(None, 1)
        return {
            'target-uri': original,
            'date': datetime.datetime.strptime(timestamp, '%Y%m%d%H%M%S')
                    .strftime('%Y-%m-%dT%H:%M:%SZ')
        }

    @property
    def warc_target(self) -> str:
        """Path of the WARC file that will be written."""
        return self._warc_target

    @warc_target.setter
    def warc_target(self, value: str):
        # Only derive a target path when the caller did not supply one.
        if value is not None:
            self._warc_target = value
        else:
            self._warc_target = create_warc_target(self.warc_source)
class Warc:
    """Deduplicate the response records of a WARC file using asyncio.

    The source file is read twice: once to collect the
    (WARC-Payload-Digest, WARC-Target-URI) pair of every ``response``
    record so the CDX API can be queried concurrently, and once to write
    the target file, turning duplicates into ``revisit`` records.
    """

    def __init__(self, warc_source: str, warc_target: str=None):
        """Prepare a deduplication run.

        :param warc_source: path of the WARC file to read.
        :param warc_target: path of the WARC file to write. ``None`` lets
            the ``warc_target`` setter derive it from ``warc_source``.
        :raises Exception: if the target file already exists.
        """
        # warc_source must be assigned first: the warc_target setter reads it.
        self.warc_source = warc_source
        self.warc_target = warc_target
        self._response_records = {}
        self._log = Log()
        self._log.log('Original WARC file is {}.'.format(self.warc_source))
        self._log.log('Deduplicated WARC file is {}.'.format(self.warc_target))
        if os.path.isfile(self.warc_target):
            self._log.log('File {} already exists.'.format(self.warc_target))
            raise Exception('File {} already exists.'.format(self.warc_target))

    @staticmethod
    def _clean_uri(uri):
        """Return *uri* without a surrounding ``<...>`` pair, if it has one."""
        if uri is not None and uri.startswith('<'):
            match = re.search('^<(.+)>$', uri)
            if match:
                return match.group(1)
        return uri

    def deduplicate(self):
        """Write the deduplicated copy of the source WARC to the target.

        Target URIs wrapped in ``<...>`` are unwrapped, ``warcinfo`` records
        get their ``WARC-Filename`` pointed at the target file, and the
        collected log is appended as a final record.
        """
        self._log.log('Start deduplication process.')

        # First pass: dict of (payload digest, URL) => IA response|None.
        # The URL is cleaned here exactly as in the second pass, otherwise
        # records with a <...> wrapped URI would never find their entry.
        ia_data = {}
        with open(self.warc_source, 'rb') as s:
            for record in ArchiveIterator(s):
                headers = record.rec_headers
                if headers.get_header('WARC-Type') == 'response':
                    key = (
                        headers.get_header('WARC-Payload-Digest'),
                        self._clean_uri(
                            headers.get_header('WARC-Target-URI'))
                    )
                    ia_data[key] = None

        self.fetch_from_ia(ia_data)

        with open(self.warc_source, 'rb') as s, \
                open(self.warc_target, 'wb') as t:
            writer = WARCWriter(filebuf=t,
                                gzip=self.warc_target.endswith('.gz'))
            for record in ArchiveIterator(s):
                headers = record.rec_headers
                raw_url = headers.get_header('WARC-Target-URI')
                record_id = headers.get_header('WARC-Record-ID')
                self._log.log('Processing record {}.'.format(record_id))
                url = self._clean_uri(raw_url)
                if url != raw_url:
                    self._log.log('Replacing URL in record {} with {}.'
                                  .format(record_id, url))
                    headers.replace_header('WARC-Target-URI', url)
                record_type = headers.get_header('WARC-Type')
                if record_type == 'response':
                    self._log.log('Deduplicating record {}.'.format(record_id))
                    key = (headers.get_header('WARC-Payload-Digest'),
                           headers.get_header('WARC-Target-URI'))
                    # A copy written earlier in this file takes precedence
                    # over whatever the CDX API reported.
                    data = self._response_records.get(key)
                    if data is None:
                        data = ia_data.get(key)
                    if data:
                        self._log.log('Record {} is a duplicate from {}.'
                                      .format(record_id, data))
                        writer.write_record(
                            self.response_to_revisit(writer, record, data)
                        )
                        continue
                    # False means the lookup failed, None means no duplicate.
                    if data is False:
                        self._log.log('Record {} could not be deduplicated.'
                                      .format(record_id))
                    else:
                        self._log.log('Record {} is not a duplicate.'
                                      .format(record_id))
                    self.register_response(record)
                    writer.write_record(record)
                elif record_type == 'warcinfo':
                    self._log.set_warcinfo(
                        headers.get_header('WARC-Record-ID'))
                    headers.replace_header('WARC-Filename', self.warc_target)
                    writer.write_record(record)
                else:
                    writer.write_record(record)
            self._log.log('Writing log to WARC.')
            writer.write_record(self._log.create_record(writer))

    def register_response(self, record):
        """Remember *record* so later identical responses can refer to it."""
        headers = record.rec_headers
        target_uri = headers.get_header('WARC-Target-URI')
        key = (headers.get_header('WARC-Payload-Digest'), target_uri)
        self._response_records[key] = {
            'record-id': headers.get_header('WARC-Record-ID'),
            'date': headers.get_header('WARC-Date'),
            'target-uri': target_uri,
        }

    @staticmethod
    def response_to_revisit(writer, record, data):
        """Build a ``revisit`` record from *record* pointing at *data*.

        :param writer: writer whose ``create_warc_record`` builds the record.
        :param record: the duplicate ``response`` record; its WARC headers
            are modified in place.
        :param data: dict with ``date`` and ``target-uri`` and optionally
            ``record-id`` describing the earlier capture.
        :return: the record returned by ``writer.create_warc_record``.
        """
        warc_headers = record.rec_headers
        # Captures found through the CDX API carry no record id.
        if data.get('record-id') is not None:
            warc_headers.replace_header('WARC-Refers-To', data['record-id'])
        warc_headers.replace_header('WARC-Refers-To-Date', data['date'])
        warc_headers.replace_header('WARC-Refers-To-Target-URI',
                                    data['target-uri'])
        warc_headers.replace_header('WARC-Type', 'revisit')
        warc_headers.replace_header('WARC-Truncated', 'length')
        warc_headers.replace_header('WARC-Profile',
                                    'http://netpreserve.org/warc/1.0/'
                                    'revisit/identical-payload-digest')
        warc_headers.remove_header('WARC-Block-Digest')
        warc_headers.remove_header('Content-Length')
        return writer.create_warc_record(
            warc_headers.get_header('WARC-Target-URI'),
            'revisit',
            warc_headers=warc_headers,
            http_headers=record.http_headers
        )

    async def fetch_single(self, key, session):
        """Query the CDX API for one (digest, URI) key.

        The ``to`` bound is tried first; the ``from`` bound is tried when
        the ``to`` query returned an empty body or kept failing. Each query
        is attempted up to ten times on ``aiohttp.ClientError``.

        :return: ``(key, text)`` where *text* is the first non-empty
            response body, ``''`` if every answered query was empty, or
            ``None`` if no non-empty body was found and a query failed on
            all attempts.
        """
        digest, uri = key
        query_failed = False
        # NOTE(review): the 'from' bound has 11 digits while the 'to' bound
        # has 12 - confirm this is intended.
        for tofrom, date in (('to', '201905310000'), ('from', '20190703000')):
            # The raw literal keeps the backslash in 'warc\/revisit'.
            url = (
                'http://wwwb-dedup.us.archive.org:8083/cdx/search'
                '?url={}&limit=100&filter=digest:{}'
                '&fl=timestamp,original&{}={}'
                r'&filter=!mimetype:warc\/revisit'
            ).format(urllib.parse.quote(uri), digest.split(':')[1],
                     tofrom, date)
            for _ in range(10):
                try:
                    async with session.get(url) as resp:
                        text = await resp.text()
                except aiohttp.ClientError:
                    # Best effort: retry; exhaustion is recorded below.
                    continue
                if text.strip():
                    return key, text
                # Empty body: nothing in this range, try the next one.
                break
            else:
                query_failed = True
        return key, (None if query_failed else '')

    async def fetch_from_ia_async(self, iaData):
        """Fill *iaData* in place with the parsed CDX result of every key.

        :param iaData: dict keyed by (payload digest, URI); each value is
            replaced by the result of ``parse_ia_response``.
        """
        # asyncio.wait() rejects an empty set of awaitables.
        if not iaData:
            return
        connector = aiohttp.TCPConnector(limit=10)
        async with aiohttp.ClientSession(connector=connector) as session:
            tasks = [asyncio.ensure_future(self.fetch_single(key, session))
                     for key in iaData]
            done, pending = await asyncio.wait(tasks)
            assert len(pending) == 0
            for task in done:
                key, response = await task
                iaData[key] = self.parse_ia_response(key, response)

    def fetch_from_ia(self, iaData: dict):
        """Run ``fetch_from_ia_async`` to completion on a private loop."""
        self._log.log('Fetching dedupe info from IA')
        # Use a fresh loop so that closing it does not break the
        # process-wide default loop for later runs.
        loop = asyncio.new_event_loop()
        try:
            loop.run_until_complete(self.fetch_from_ia_async(iaData))
        finally:
            loop.close()
        self._log.log('Fetched dedupe info from IA')

    def parse_ia_response(self, key, response):
        """Interpret the CDX API response body for *key*.

        :param key: (payload digest, URI) pair, used for log messages only.
        :param response: response body as ``str``, or ``None`` when no
            response could be obtained.
        :return: ``False`` if an error occurred, ``None`` if there is no
            previous record, or a dict with ``target-uri`` and ``date`` if
            there is.
        """
        if response is None:
            self._log.log(
                'Key {} got no or a bad CDX API response.'.format(key))
            return False
        if len(response.strip()) == 0:
            return None
        if ('org.archive.wayback.exception.'
                'RobotAccessControlException') in response:
            self._log.log('Key {} is blocked by robots.txt.'.format(key))
            return False
        if ('org.archive.wayback.exception.'
                'AdministrativeAccessControlException') in response:
            self._log.log('Key {} is excluded from the CDX API.'.format(key))
            return False
        if 'Requested Line is too large' in response:
            self._log.log('Key {} has a too large URL.'.format(key))
            return False
        # Use the first line that looks like "<14-digit timestamp> <url>".
        line = next(
            (candidate for candidate in response.splitlines()
             if re.search(r'^[0-9]{14}\s+https?://', candidate)),
            None)
        if line is None:
            self._log.log('Key {} for an invalid CDX API response'.format(key))
            return False
        # Split on any whitespace run, matching the \s+ accepted above.
        timestamp, original = line.strip().split(None, 1)
        return {
            'target-uri': original,
            'date': datetime.datetime.strptime(timestamp, '%Y%m%d%H%M%S')
                    .strftime('%Y-%m-%dT%H:%M:%SZ')
        }

    @property
    def warc_target(self) -> str:
        """Path of the WARC file that will be written."""
        return self._warc_target

    @warc_target.setter
    def warc_target(self, value: str):
        # Only derive a target path when the caller did not supply one.
        if value is not None:
            self._warc_target = value
        else:
            self._warc_target = create_warc_target(self.warc_source)
# Example #4
# 0
class Warc:
    """Deduplicate the response records of a WARC file.

    Every record of the source file is copied to the target file. A
    ``response`` record whose (WARC-Payload-Digest, WARC-Target-URI) pair
    was already written earlier in the same file, or is reported by the CDX
    API, is written as a ``revisit`` record instead.
    """

    def __init__(self, warc_source: str, warc_target: str=None):
        """Prepare a deduplication run.

        :param warc_source: path of the WARC file to read.
        :param warc_target: path of the WARC file to write. ``None`` lets
            the ``warc_target`` setter derive it from ``warc_source``.
        :raises Exception: if the target file already exists.
        """
        # warc_source must be assigned first: the warc_target setter reads it.
        self.warc_source = warc_source
        self.warc_target = warc_target
        self._response_records = {}
        self._log = Log()
        self._log.log('Original WARC file is {}.'.format(self.warc_source))
        self._log.log('Deduplicated WARC file is {}.'.format(self.warc_target))
        if os.path.isfile(self.warc_target):
            self._log.log('File {} already exists.'.format(self.warc_target))
            raise Exception('File {} already exists.'.format(self.warc_target))

    def deduplicate(self):
        """Write the deduplicated copy of the source WARC to the target.

        Target URIs wrapped in ``<...>`` are unwrapped, ``warcinfo`` records
        get their ``WARC-Filename`` pointed at the target file, and the
        collected log is appended as a final record.
        """
        self._log.log('Start deduplication process.')
        with open(self.warc_source, 'rb') as s, \
                open(self.warc_target, 'wb') as t:
            writer = WARCWriter(filebuf=t,
                                gzip=self.warc_target.endswith('.gz'))
            for record in ArchiveIterator(s):
                headers = record.rec_headers
                url = headers.get_header('WARC-Target-URI')
                record_id = headers.get_header('WARC-Record-ID')
                self._log.log('Processing record {}.'.format(record_id))
                if url is not None and url.startswith('<'):
                    url = re.search('^<(.+)>$', url).group(1)
                    self._log.log('Replacing URL in record {} with {}.'
                                  .format(record_id, url))
                    headers.replace_header('WARC-Target-URI', url)
                record_type = headers.get_header('WARC-Type')
                if record_type == 'response':
                    data = self.get_duplicate(record)
                    if data:
                        self._log.log('Record {} is duplicate from {}.'
                                      .format(record_id, data))
                        writer.write_record(
                            self.response_to_revisit(writer, record, data)
                        )
                    else:
                        self.register_response(record)
                        writer.write_record(record)
                elif record_type == 'warcinfo':
                    self._log.set_warcinfo(
                        headers.get_header('WARC-Record-ID'))
                    headers.replace_header('WARC-Filename', self.warc_target)
                    writer.write_record(record)
                else:
                    writer.write_record(record)
            self._log.log('Writing log to WARC.')
            writer.write_record(self._log.create_record(writer))

    def register_response(self, record):
        """Remember *record* so later identical responses can refer to it."""
        headers = record.rec_headers
        target_uri = headers.get_header('WARC-Target-URI')
        key = (headers.get_header('WARC-Payload-Digest'), target_uri)
        self._response_records[key] = {
            'record-id': headers.get_header('WARC-Record-ID'),
            'date': headers.get_header('WARC-Date'),
            'target-uri': target_uri,
        }

    @staticmethod
    def response_to_revisit(writer, record, data):
        """Build a ``revisit`` record from *record* pointing at *data*.

        :param writer: writer whose ``create_warc_record`` builds the record.
        :param record: the duplicate ``response`` record; its WARC headers
            are modified in place.
        :param data: dict with ``date`` and ``target-uri`` and optionally
            ``record-id`` describing the earlier capture.
        :return: the record returned by ``writer.create_warc_record``.
        """
        warc_headers = record.rec_headers
        # Captures found through the CDX API carry no record id.
        if data.get('record-id') is not None:
            warc_headers.replace_header('WARC-Refers-To', data['record-id'])
        warc_headers.replace_header('WARC-Refers-To-Date', data['date'])
        warc_headers.replace_header('WARC-Refers-To-Target-URI',
                                    data['target-uri'])
        warc_headers.replace_header('WARC-Type', 'revisit')
        warc_headers.replace_header('WARC-Truncated', 'length')
        warc_headers.replace_header('WARC-Profile',
                                    'http://netpreserve.org/warc/1.0/'
                                    'revisit/identical-payload-digest')
        warc_headers.remove_header('WARC-Block-Digest')
        warc_headers.remove_header('Content-Length')
        return writer.create_warc_record(
            warc_headers.get_header('WARC-Target-URI'),
            'revisit',
            warc_headers=warc_headers,
            http_headers=record.http_headers
        )

    def get_duplicate(self, record):
        """Look up an earlier capture identical to *record*.

        Records registered from this file are consulted first, then the CDX
        API.

        :return: a dict describing the earlier capture, or ``None``.
        """
        key = (
            record.rec_headers.get_header('WARC-Payload-Digest'),
            record.rec_headers.get_header('WARC-Target-URI')
        )
        if key in self._response_records:
            return self._response_records[key]
        return self.get_ia_duplicate(record)

    @staticmethod
    def get_ia_duplicate(record):
        """Ask the CDX API for a capture older than *record* with the same
        URL and payload digest.

        :param record: the ``response`` record to look up.
        :return: a dict with ``target-uri`` and ``date`` for the capture,
            or ``None`` if the response body is empty or is not an
            ``<original> <timestamp>`` line.
        """
        # Imported locally so this method does not depend on a module-level
        # import of urllib.parse.
        import urllib.parse

        headers = record.rec_headers
        date = datetime.datetime.strptime(
            headers.get_header('WARC-Date'), '%Y-%m-%dT%H:%M:%SZ'
        ).strftime('%Y%m%d%H%M%S')
        digest = headers.get_header('WARC-Payload-Digest')
        uri = headers.get_header('WARC-Target-URI')
        # Quote the URI so characters such as '&' or '#' stay part of the
        # url parameter. The raw literal keeps the backslash in
        # 'warc\/revisit'.
        request_url = (
            'http://wwwb-dedup.us.archive.org:8083/cdx/search'
            '?url={}&limit=1&filter=digest:{}&fl=original,timestamp&to={}'
            r'&filter=!mimetype:warc\/revisit'
        ).format(urllib.parse.quote(uri), digest.split(':')[1],
                 int(date) - 1)
        body = requests.get(request_url).text.strip()
        if not body:
            return None
        # Anything other than "<original> <timestamp>" (for example an
        # error page) is treated as "no duplicate" instead of aborting the
        # whole run with an IndexError/ValueError.
        fields = body.splitlines()[0].split()
        if len(fields) != 2:
            return None
        original, timestamp = fields
        try:
            captured = datetime.datetime.strptime(timestamp, '%Y%m%d%H%M%S')
        except ValueError:
            return None
        return {
            'target-uri': original,
            'date': captured.strftime('%Y-%m-%dT%H:%M:%SZ')
        }

    @property
    def warc_target(self) -> str:
        """Path of the WARC file that will be written."""
        return self._warc_target

    @warc_target.setter
    def warc_target(self, value: str):
        # Only derive a target path when the caller did not supply one.
        if value is not None:
            self._warc_target = value
        else:
            self._warc_target = create_warc_target(self.warc_source)