def get_change(self, *, page_id, to_version_id, from_version_id=''):
    """
    Get a Change between two Versions.

    Parameters
    ----------
    page_id : string
    to_version_id : string
    from_version_id : string, optional
        If from_version_id is not given, it will be treated as the version
        immediately prior to ``to_version``.

    Returns
    -------
    response : dict
    """
    url = (f'/pages/{page_id}/changes/'
           f'{from_version_id}..{to_version_id}')
    result = self.request_json(GET, url)
    # In place, replace datetime strings with datetime objects.
    data = result['data']
    # For changes which were created just-in-time to fulfill this API
    # request, created_at/updated_at will be None, and that's OK.
    if data['created_at'] and data['updated_at']:
        data['created_at'] = parse_timestamp(data['created_at'])
        data['updated_at'] = parse_timestamp(data['updated_at'])
    return result
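# A minimal usage sketch for get_change above. The client object and the IDs
# are hypothetical; only the method signature comes from this module:
#
#     change = client.get_change(page_id='<page-uuid>',
#                                to_version_id='<version-uuid>')
#     # created_at/updated_at in change['data'] are now datetime objects.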
def get_version(self, version_id, include_change_from_previous=None,
                include_change_from_earliest=None):
    """
    Lookup a specific Version by ID.

    Parameters
    ----------
    version_id : string
    include_change_from_previous : boolean, optional
        If True, include a `change_from_previous` field that represents a
        change object between this and the previous version of the same
        page.
    include_change_from_earliest : boolean, optional
        If True, include a `change_from_earliest` field that represents a
        change object between this and the earliest version of the same
        page.

    Returns
    -------
    response : dict
    """
    url = f'/versions/{version_id}'
    params = {
        'include_change_from_previous': include_change_from_previous,
        'include_change_from_earliest': include_change_from_earliest
    }
    result = self.request_json(GET, url, params=params)
    data = result['data']
    data['capture_time'] = parse_timestamp(data['capture_time'])
    data['updated_at'] = parse_timestamp(data['updated_at'])
    data['created_at'] = parse_timestamp(data['created_at'])
    return result
def list_changes(self, page_id, include_total=False):
    """
    List Changes between two Versions on a Page.

    Parameters
    ----------
    page_id : string
    include_total : boolean, optional
        Whether to include a `meta.total_results` field in the response.
        If not set, `links.last` will usually be empty unless you are on
        the last chunk. Setting this option runs a pretty expensive query,
        so use it sparingly. (Default: False)

    Returns
    -------
    response : dict
    """
    url = f'/pages/{page_id}/changes/'
    result = self.request_json(
        GET, url, params={'include_total': include_total or None})
    # In place, replace datetime strings with datetime objects.
    for change in result['data']:
        change['created_at'] = parse_timestamp(change['created_at'])
        change['updated_at'] = parse_timestamp(change['updated_at'])
    return result
def list_versions(self, *, page_id=None, chunk=None, chunk_size=None,
                  start_date=None, end_date=None, source_type=None,
                  hash=None, source_metadata=None):
    """
    List Versions, optionally filtered by search criteria, including Page.

    Parameters
    ----------
    page_id : string, optional
        restricts search to Versions of a specific Page
    chunk : integer, optional
        pagination parameter
    chunk_size : integer, optional
        number of items per chunk
    start_date : datetime, optional
    end_date : datetime, optional
    source_type : string, optional
        such as 'versionista' or 'internetarchive'
    hash : string, optional
        SHA256 hash of Version content
    source_metadata : dict, optional
        Examples:

        * ``{'version_id': 12345678}``
        * ``{'account': 'versionista1', 'has_content': True}``

    Returns
    -------
    response : dict
    """
    params = {
        'chunk': chunk,
        'chunk_size': chunk_size,
        'capture_time': _time_range_string(start_date, end_date),
        'source_type': source_type,
        'hash': hash
    }
    if source_metadata is not None:
        for k, v in source_metadata.items():
            params[f'source_metadata[{k}]'] = v
    if page_id is None:
        url = f'{self._api_url}/versions'
    else:
        url = f'{self._api_url}/pages/{page_id}/versions'
    res = requests.get(url, auth=self._auth, params=params)
    _process_errors(res)
    result = res.json()
    # In place, replace datetime strings with datetime objects.
    for v in result['data']:
        v['created_at'] = parse_timestamp(v['created_at'])
        v['updated_at'] = parse_timestamp(v['updated_at'])
        v['capture_time'] = parse_timestamp(v['capture_time'])
    return result
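# Sketch of how the source_metadata filter above flattens into query
# parameters (values are hypothetical):
#
#     source_metadata={'account': 'versionista1', 'has_content': True}
#     # sends ?source_metadata[account]=versionista1
#     #       &source_metadata[has_content]=True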
def get_annotation(self, *, annotation_id, page_id, to_version_id,
                   from_version_id=''):
    """
    Get a specific Annotation.

    Parameters
    ----------
    annotation_id : string
    page_id : string
    to_version_id : string
    from_version_id : string, optional
        If from_version_id is not given, it will be treated as the version
        immediately prior to ``to_version``.

    Returns
    -------
    response : dict
    """
    url = (f'{self._api_url}/pages/{page_id}/changes/'
           f'{from_version_id}..{to_version_id}/annotations/'
           f'{annotation_id}')
    res = requests.get(url, auth=self._auth)
    _process_errors(res)
    result = res.json()
    # In place, replace datetime strings with datetime objects.
    data = result['data']
    data['created_at'] = parse_timestamp(data['created_at'])
    data['updated_at'] = parse_timestamp(data['updated_at'])
    return result
def handle(self, *args, **options):
    from_date = parse_timestamp(options["from"])
    to_date = parse_timestamp(options["to"])
    uploads = UploadEvent.objects.filter(created__gte=from_date,
                                         created__lte=to_date)
    queue_upload_events_for_reprocessing(uploads, use_kinesis=True)
def endElement(self, tag):
    if tag == 'title':
        self._note['title'] = self._data
    elif tag == 'author':
        self._note['author'] = self._data
    elif tag == 'created':
        dt = parse_timestamp(self._data)
        self._note['created'] = dt
    elif tag == 'updated':
        dt = parse_timestamp(self._data)
        self._note['updated'] = dt
    elif tag == 'content':
        self._note['content'] = self._data.strip()
    elif tag == 'note':
        self._exporter.export(self._note)
        self._note = None
    elif tag == 'data':
        self._attachment['data'] = b64decode(self._data)
    elif tag == 'mime':
        self._attachment['mimetype'] = self._data
    elif tag == 'file-name':
        self._attachment['filename'] = self._data
    elif tag == 'resource':
        if 'attachments' not in self._note:
            self._note['attachments'] = []
        self._note['attachments'].append(self._attachment)
        # Reset the attachment being built. (The original assigned to a
        # stray `self._attachments`, which was never read.)
        self._attachment = None
    else:
        return
    self._capture = False
def get_page(self, page_id):
    """
    Lookup a specific Page by ID.

    Parameters
    ----------
    page_id : string

    Returns
    -------
    response : dict
    """
    url = f'{self._api_url}/pages/{page_id}'
    res = requests.get(url, auth=self._auth)
    _process_errors(res)
    result = res.json()
    # In place, replace datetime strings with datetime objects.
    data = result['data']
    data['created_at'] = parse_timestamp(data['created_at'])
    data['updated_at'] = parse_timestamp(data['updated_at'])
    for v in data['versions']:
        v['created_at'] = parse_timestamp(v['created_at'])
        v['updated_at'] = parse_timestamp(v['updated_at'])
        v['capture_time'] = parse_timestamp(v['capture_time'])
    return result
def list_annotations(self, *, page_id, to_version_id, from_version_id='',
                     include_total=False):
    """
    List Annotations for a Change between two Versions.

    Parameters
    ----------
    page_id : string
    to_version_id : string
    from_version_id : string, optional
        If from_version_id is not given, it will be treated as the version
        immediately prior to ``to_version``.
    include_total : boolean, optional
        Whether to include a `meta.total_results` field in the response.
        If not set, `links.last` will usually be empty unless you are on
        the last chunk. Setting this option runs a pretty expensive query,
        so use it sparingly. (Default: False)

    Returns
    -------
    response : dict
    """
    url = (f'/pages/{page_id}/changes/'
           f'{from_version_id}..{to_version_id}/annotations')
    result = self.request_json(
        GET, url, params={'include_total': include_total or None})
    # In place, replace datetime strings with datetime objects.
    for a in result['data']:
        a['created_at'] = parse_timestamp(a['created_at'])
        a['updated_at'] = parse_timestamp(a['updated_at'])
    return result
def list_annotations(self, *, page_id, to_version_id, from_version_id=''):
    """
    List Annotations for a Change between two Versions.

    Parameters
    ----------
    page_id : string
    to_version_id : string
    from_version_id : string, optional
        If from_version_id is not given, it will be treated as the version
        immediately prior to ``to_version``.

    Returns
    -------
    response : dict
    """
    url = (f'{self._api_url}/pages/{page_id}/changes/'
           f'{from_version_id}..{to_version_id}/annotations')
    res = requests.get(url, auth=self._auth)
    _process_errors(res)
    result = res.json()
    # In place, replace datetime strings with datetime objects.
    for a in result['data']:
        a['created_at'] = parse_timestamp(a['created_at'])
        a['updated_at'] = parse_timestamp(a['updated_at'])
    return result
def test_04_last_run(self):
    task1_id = set_periodic_task("task one", "*/5 * * * *",
                                 ["pinode1", "pinode2"],
                                 "some.task.module", 3,
                                 {"key1": 1, "key2": False})
    self.assertEqual(len(PeriodicTask.query.all()), 1)
    task1_entry = PeriodicTask.query.filter_by(id=task1_id).one()
    # We have no initial last runs
    self.assertEqual(len(list(task1_entry.last_runs)), 0)
    set_periodic_task_last_run(task1_id, "pinode1",
                               parse_timestamp("2018-06-26 08:00+02:00"))
    set_periodic_task_last_run(task1_id, "pinode1",
                               parse_timestamp("2018-06-26 08:05+02:00"))
    task1 = get_periodic_tasks("task one")[0]
    self.assertEqual(len(list(task1_entry.last_runs)), 1)
    self.assertEqual(task1_entry.last_runs[0].timestamp,
                     parse_timestamp("2018-06-26 06:05"))
    self.assertEqual(task1["last_runs"]["pinode1"],
                     parse_timestamp("2018-06-26 06:05 UTC"))
    set_periodic_task_last_run(task1_id, "pinode2",
                               parse_timestamp("2018-06-26 08:10+01:00"))
    set_periodic_task_last_run(task1_id, "pinode3",
                               parse_timestamp("2018-06-26 08:10-08:00"))
    task1 = get_periodic_tasks("task one")[0]
    self.assertEqual(task1["last_runs"]["pinode1"],
                     parse_timestamp("2018-06-26 06:05 UTC"))
    self.assertEqual(task1["last_runs"]["pinode2"],
                     parse_timestamp("2018-06-26 07:10 UTC"))
    self.assertEqual(task1["last_runs"]["pinode3"],
                     parse_timestamp("2018-06-26 16:10 UTC"))
    delete_periodic_task(task1_id)
def __init__(self, data):
    if data.get('created'):
        self.created = parse_timestamp(data['created'])
    if data.get('createdBy'):
        self.created_by = Profile(data['createdBy'])
    if data.get('edited'):
        self.edited = parse_timestamp(data['edited'])
    if data.get('editedBy'):
        self.edited_by = Profile(data['editedBy'])
    if data.get('flags'):
        self.flags = data['flags']
    if data.get('permissions'):
        self.permissions = PermissionSet(data['permissions'])
    if data.get('links'):
        self.links = {}
        for item in data['links']:
            href = str(item['href']).replace('/api/v1', '')
            if 'title' in item:
                self.links[item['rel']] = {'href': href,
                                           'title': item['title']}
            else:
                self.links[item['rel']] = {'href': href}
def process_timestamp(resource):
    """
    Recurse over unmarshalled json and convert any strings that are
    ISO8601-like into python datetime objects. This is far from ideal and
    will be replaced in future with xpath-like notation for visiting
    specific attributes.

    Args:
        resource: a JSON API response that has been deserialized. This
            will usually be a dictionary but could also be a list.

    Returns:
        the same resource, but with timestamp strings as datetime objects.
    """
    if isinstance(resource, list):
        for item in resource:
            APIResource.process_timestamp(item)
    else:
        for key in resource.keys():
            if isinstance(resource[key], unicode):
                if bool(VALID_DATETIME.search(resource[key])):
                    resource[key] = parse_timestamp(resource[key])
            elif isinstance(resource[key], (list, dict)):
                APIResource.process_timestamp(resource[key])
    return resource
def ensure_tztime(ts):
    if isinstance(ts, str):
        ts = parse_timestamp(ts)
    try:
        return pytz.utc.localize(ts)
    except Exception:
        # localize() raises if ts is already timezone-aware; return as-is.
        return ts
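# Behavior sketch for ensure_tztime (requires pytz and dateutil's parse as
# parse_timestamp):
#
#     ensure_tztime('2018-06-26 08:00')
#     # -> datetime(2018, 6, 26, 8, 0, tzinfo=<UTC>)
#     ensure_tztime('2018-06-26 08:00+02:00')
#     # -> unchanged aware datetime (localize() refuses aware input)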
def from_api_response(cls, data):
    file_metadata = cls()
    file_metadata.created = parse_timestamp(data[0]['created'])
    file_metadata.file_size = data[0]['fileSize']
    file_metadata.file_hash = data[0]['fileHash']
    file_metadata.mime_type = data[0]['mimeType']
    if data[0].get('fileExt'):
        file_metadata.file_ext = data[0]['fileExt']
    return file_metadata
def parse_cmd_timestamp(args):
    if len(args) == 0:
        return datetime.now()
    elif args[0][0] == '+':
        s = timedelta(seconds=int(args[0][1:]))
        timestamp = datetime.now() + s
    else:
        timestamp = parse_timestamp(args[0])
    return timestamp
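# Argument handling sketch for parse_cmd_timestamp (stdlib datetime and
# timedelta, plus dateutil's parse as parse_timestamp):
#
#     parse_cmd_timestamp([])                     # no args -> now
#     parse_cmd_timestamp(['+300'])               # leading '+' -> now + 300s
#     parse_cmd_timestamp(['2018-06-26 08:00'])   # anything else is parsed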
def get_console_output(self, xml_bytes):
    # Note: str.decode("base64") only exists on Python 2.
    root = XML(xml_bytes)
    output_node = root.find("output")
    instance_id = root.find("instanceId").text.decode("ascii").strip()
    timestamp = parse_timestamp(root.find("timestamp").text)
    console_text = output_node.text.decode("base64").decode("utf-8")
    return model.ConsoleOutput(
        instance_id, timestamp, console_text,
    )
def load_from_web_dict(d):
    s = d["OrderType"]
    if "BUY" == s:
        c = Buy_op
    elif "SELL" == s:
        c = Sell_op
    else:
        m = "'{}' is not a valid Operation type"
        raise Exception(m.format(s))
    return c(parse_timestamp(d["TimeStamp"]))
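# Example payload for load_from_web_dict (field values are hypothetical):
#
#     op = load_from_web_dict({'OrderType': 'BUY',
#                              'TimeStamp': '2018-06-26T08:00:00'})
#     # -> Buy_op(datetime(2018, 6, 26, 8, 0))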
def __init__(self, data, summary=True):
    """
    We're permissive about the data passed in, since it may be a PUT or
    PATCH operation and not have all the expected keys.
    """
    if data.get('id'):
        self.id = data['id']
    if data.get('siteId'):
        self.site_id = data['siteId']
    if data.get('userId'):
        self.user_id = data['userId']
    if data.get('email'):
        self.email = data['email']
    if data.get('profileName'):
        self.profile_name = data['profileName']
    if data.get('visible'):
        self.visible = data['visible']
    if data.get('avatar'):
        self.avatar = data['avatar']
    if data.get('meta'):
        self.meta = Meta(data['meta'])
    if not summary:
        self.style_id = data['styleId']
        self.item_count = data['itemCount']
        self.comment_count = data['commentCount']
        self.created = parse_timestamp(data['created'])
        self.last_active = parse_timestamp(data['lastActive'])
def get_version(self, version_id):
    """
    Lookup a specific Version by ID.

    Parameters
    ----------
    version_id : string

    Returns
    -------
    response : dict
    """
    url = f'{self._api_url}/versions/{version_id}'
    res = requests.get(url, auth=self._auth)
    _process_errors(res)
    result = res.json()
    data = result['data']
    data['capture_time'] = parse_timestamp(data['capture_time'])
    data['updated_at'] = parse_timestamp(data['updated_at'])
    data['created_at'] = parse_timestamp(data['created_at'])
    return result
def list_changes(self, page_id):
    """
    List Changes between two Versions on a Page.

    Parameters
    ----------
    page_id : string

    Returns
    -------
    response : dict
    """
    url = f'{self._api_url}/pages/{page_id}/changes/'
    res = requests.get(url, auth=self._auth)
    _process_errors(res)
    result = res.json()
    # In place, replace datetime strings with datetime objects.
    for change in result['data']:
        change['created_at'] = parse_timestamp(change['created_at'])
        change['updated_at'] = parse_timestamp(change['updated_at'])
    return result
def validate_and_clean_timestamp(self, data, field_name):
    value = data.get(field_name)
    error_msg = ("Field '{}' is required and expected to be an ISO 8601 "
                 "UTC timestamp".format(field_name))
    if not value:
        raise OrderStatusValidationError(error_msg)
    try:
        value = parse_timestamp(value)
    except ValueError:
        raise OrderStatusValidationError(error_msg)
    data[field_name] = value
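# Hypothetical use of the validator above (the field name and value are
# invented; OrderStatusValidationError comes from this module):
#
#     data = {'occurred_at': '2018-06-26T08:00:00Z'}
#     self.validate_and_clean_timestamp(data, 'occurred_at')
#     # data['occurred_at'] is now a datetime; a missing or unparseable
#     # value raises OrderStatusValidationError instead.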
def _normalize_last_modified(self, last_modified):
    """
    Normalize timestamp of S3 key which is from bucket.get_key method.

    :param last_modified: raw last-modified timestamp string from S3
    :return: the timestamp reformatted as ``%Y-%m-%dT%H:%M:%S.000Z``
    """
    try:
        mtime = parse_timestamp(last_modified)
        return mtime.strftime('%Y-%m-%dT%H:%M:%S.000Z')
    except Exception:
        logger.exception("Failed to normalize last modified time.",
                         minimum_last_modified=last_modified)
        raise
def get_page(self, page_id):
    """
    Lookup a specific Page by ID.

    Parameters
    ----------
    page_id : string

    Returns
    -------
    response : dict
    """
    url = f'/pages/{page_id}'
    result = self.request_json(GET, url)
    # In place, replace datetime strings with datetime objects.
    data = result['data']
    data['created_at'] = parse_timestamp(data['created_at'])
    data['updated_at'] = parse_timestamp(data['updated_at'])
    for v in data['versions']:
        v['created_at'] = parse_timestamp(v['created_at'])
        v['updated_at'] = parse_timestamp(v['updated_at'])
        v['capture_time'] = parse_timestamp(v['capture_time'])
    return result
def from_xml(cls, xml):
    if xml.tag != cls.tagname:
        raise ValueError("%s.from_xml() called with %r, not %r" %
                         (cls.__name__, xml.tag, cls.tagname))
    ts = xml.attrib.get("ts")
    if ts:
        ts = parse_timestamp(ts)
    ret = cls(ts)
    for element in xml:
        ecls = node_for_tagname(element.tag)
        node = ecls.from_xml(element)
        for attrname in ecls.attributes:
            setattr(node, attrname, element.attrib.get(attrname))
        ret.nodes.append(node)
    return ret
def convert_datetime_params(params: Dict[str, Any]) -> Dict[str, Any]:
    """Convert any dates, datetimes, or timestamps in other formats into
    ISO 8601 strings.

    API behavior note: params that take date but not time info will accept
    a full timestamp and just ignore the time, so it's safe to parse both
    date and datetime strings into timestamps

    :raises: :py:exc:`dateutil.parser._parser.ParserError` if a
        date/datetime format is invalid
    """
    for k, v in params.items():
        if isinstance(v, (datetime, date)):
            params[k] = _isoformat(v)
        elif k in DATETIME_PARAMS:
            # Only strings reach this branch; date/datetime objects were
            # already formatted above, and parse_timestamp() requires a
            # string. (The original used a second `if`, which would re-parse
            # datetime objects and fail.)
            params[k] = _isoformat(parse_timestamp(v))
    return params
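# A small demonstration of convert_datetime_params, assuming DATETIME_PARAMS
# contains 'observed_on' (the real constant's contents may differ):
#
#     convert_datetime_params({'observed_on': '26 Jun 2018',
#                              'created': datetime(2018, 6, 26, 8, 0),
#                              'per_page': 30})
#     # -> both timestamp params become ISO 8601 strings;
#     #    'per_page' is left untouched.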
def published_at(self):
    date = NULL_DATE

    def dt_specificity(dt):
        # Count the populated components of a datetime. (The original read
        # the outer `date` instead of the `dt` argument, which gave every
        # candidate the same specificity.)
        return sum(1 if getattr(dt, k) else 0
                   for k in ('month', 'day', 'hour', 'minute', 'second',
                             'microsecond', 'tzinfo'))

    if ("lastmod" in self.sitemap_data and self.sitemap_data["lastmod"]
            and (datetime.datetime.now() - self.sitemap_data["lastmod"])
            < datetime.timedelta(days=365 * 10)):
        date = self.sitemap_data["lastmod"]
    else:
        # print(f"Guessing date for {self.url}...")
        # guess = guess_date(url=self.url, html=self.html)
        # date = guess.date or NULL_DATE
        # if date is not NULL_DATE:
        #     print(f"Found a date: {guess.date}! Accuracy is to: {guess.accuracy}")
        # else:
        #     print(f"No date extracted. Attempting to find a date in the metadata...")
        most_specific = None
        curr_specificity = 0
        seen = set()
        for match in self.lxml.xpath(UniversalSelector.published_at):
            if match in seen:
                continue
            seen.add(match)
            try:
                date = parse_timestamp(str(re.sub(r"\n", " ",
                                                  match.strip())))
                print(f"Found a date! {date}")
                specificity = dt_specificity(date)
                if specificity > curr_specificity:
                    most_specific = date
                    curr_specificity = specificity
            except Exception:
                pass
        if most_specific:
            date = most_specific
    try:
        date = date.replace(tzinfo=None)
    except Exception:
        pass
    return date
def validate_and_clean_time(self, data, field_name):
    value = data.get(field_name)
    error_msg = ("Field '{}' is required and expected to be a 24-hour "
                 "time string HH:MM".format(field_name))
    if not isinstance(value, basestring):
        raise OrderStatusValidationError(error_msg)
    if not re.match(r'^\d\d?:\d\d$', value):
        raise OrderStatusValidationError(error_msg)
    try:
        value = parse_timestamp(value).time()
    except ValueError:
        raise OrderStatusValidationError(error_msg)
    data[field_name] = value
def validate_and_clean_time(self, data, field_name):
    value = data.get(field_name)
    error_msg = ("Field '{}' is required and expected to be a 24-hour "
                 "time string HH:MM".format(field_name))
    if not isinstance(value, str):
        raise OrderStatusValidationError(error_msg)
    if not re.match(r'^\d\d?:\d\d$', value):
        raise OrderStatusValidationError(error_msg)
    try:
        value = parse_timestamp(value).time()
    except ValueError:
        raise OrderStatusValidationError(error_msg)
    data[field_name] = value
def parse_document_timestamps(doc, date_attrs=('modified', 'created')):
    """Converts the `modified' and `created' dates from ISO 8601 format
    to datetime objects for the given document.
    """
    for date in date_attrs:
        date_str = doc.get(date)
        try:
            date_obj = parse_timestamp(date_str)
        except (ValueError, TypeError):
            # TypeError covers a missing attribute (date_str is None).
            logging.error('Error trying to parse "%s"', date_str)
            date_obj = None
        doc.update({date: date_obj})
    return doc
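# Example run of parse_document_timestamps (made-up document):
#
#     parse_document_timestamps({'modified': '2018-06-26T08:00:00Z',
#                                'created': 'not-a-date'})
#     # -> 'modified' becomes a datetime; 'created' fails to parse and is
#     #    set to None (with an error logged).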
def __init__(self, xml):
    url = xml.find("loc")
    lastmod = xml.find("lastmod")
    title = xml.find("news:title")
    description = xml.find("news:description")
    keywords = xml.find("news:keywords")
    publication_date = xml.find("news:publication_date")
    if not title:
        title = xml.find("video:title")
    if not description:
        description = xml.find("video:description")
    self.url = format_text(url_normalize(url.text.strip().lower()))
    self.html = ""
    self.tree = None
    parsed = urlparse(self.url)
    self.site = parsed.netloc
    self.path = parsed.path
    try:
        pardir = "/".join(re.sub(r"(/)$", "", self.path).split("/")[:-2])
    except Exception:
        pardir = "/"
    self.base_url = f"{parsed.scheme}://{parsed.netloc}{pardir}"
    self.lastmod = (parse_timestamp(format_text(lastmod.text))
                    if lastmod else None)
    self.headline = format_text(title.text.strip()) if title else ""
    self.keywords = ([format_text(kw) for kw in keywords.text.split(",")]
                     if keywords else [])
    self.publication_date = (format_text(publication_date.text)
                             if publication_date else "")
    self.description = format_text(description.text) if description else ""
    self.xml = format_text(repr(xml))
    self.metadata = {"schemata": [], "errors": []}
    self.has_metadata = False
    self.seen = self.url in seen
    # seen.add(self.url)
    self.articlebody = ""
    self.visited = False
def test_01_crud(self):
    # no tasks yet
    status_code, data = self.simulate_request('/periodictask/',
                                              method='GET')
    self.assertEqual(status_code, 200)
    self.assertTrue(data['result']['status'])
    self.assertEqual(data['result']['value'], [])
    # need authorization
    status_code, data = self.simulate_request(
        '/periodictask/', method='GET', headers={'Authorization': 'ABC'})
    self.assertEqual(status_code, 401)
    self.assertFalse(data['result']['status'])
    # create task
    with self.mock_task_module():
        task_dict1 = {
            'name': 'some task',
            'nodes': 'pinode1, pinode2',
            'active': False,
            'interval': '0 8 * * *',
            'taskmodule': 'UnitTest',
            'ordering': 5,
            'options': '{"something": 123, "else": true}',
        }
        status_code, data = self.simulate_request('/periodictask/',
                                                  method='POST',
                                                  data=task_dict1)
        self.assertEqual(status_code, 200)
        self.assertEqual(data['result']['status'], True)
        ptask_id1 = data['result']['value']
    # some invalid tasks
    invalid_task_dicts = [
        # invalid ordering
        {
            'name': 'some other task',
            'active': False,
            'nodes': 'a, b',
            'interval': '0 8 * * *',
            'taskmodule': 'UnitTest',
            'ordering': '-3',
            'options': '{"something": "123", "else": true}',
        },
        # no nodes
        {
            'name': 'some other task',
            'active': False,
            'interval': '0 8 * * *',
            'taskmodule': 'UnitTest',
            'options': '{"something": "123", "else": true}',
        },
        # empty nodes
        {
            'name': 'some other task',
            'active': False,
            'interval': '0 8 * * *',
            'nodes': ' ',
            'taskmodule': 'UnitTest',
            'options': '{"something": "123", "else": true}',
        },
        # unknown taskmodule
        {
            'name': 'some other task',
            'nodes': 'pinode1, pinode2',
            'active': False,
            'interval': '0 8 * * *',
            'taskmodule': 'Unknown',
            'options': '{"something": "123"}',
        },
        # invalid interval
        {
            'name': 'some other task',
            'nodes': 'pinode1, pinode2',
            'active': False,
            'interval': 'every day',
            'taskmodule': 'UnitTest',
            'options': '{"something": "123"}',
        },
        # invalid options
        {
            'name': 'some task',
            'nodes': 'pinode1, pinode2',
            'active': False,
            'interval': '0 8 * * *',
            'taskmodule': 'UnitTest',
            'options': '[1, 2]',
        }
    ]
    # all result in ERR905
    with self.mock_task_module():
        for invalid_task_dict in invalid_task_dicts:
            status_code, data = self.simulate_request(
                '/periodictask/', method='POST', data=invalid_task_dict)
            self.assertEqual(status_code, 400)
            self.assertFalse(data['result']['status'])
            self.assertIn('ERR905', data['result']['error']['message'])
    # create another task
    with self.mock_task_module():
        task_dict2 = {
            'name': 'some other task',
            'nodes': 'pinode1',
            'active': False,
            'interval': '0 8 * * 0',
            'taskmodule': 'UnitTest',
            'ordering': 2,
        }
        status_code, data = self.simulate_request('/periodictask/',
                                                  method='POST',
                                                  data=task_dict2)
        self.assertEqual(status_code, 200)
        self.assertTrue(data['result']['status'])
        ptask_id2 = data['result']['value']
    # can list the periodic tasks
    status_code, data = self.simulate_request('/periodictask/',
                                              method='GET')
    self.assertEqual(status_code, 200)
    self.assertTrue(data['result']['status'])
    self.assertEqual(len(data['result']['value']), 2)
    self.assertEqual([task['name'] for task in data['result']['value']],
                     ['some other task', 'some task'])
    # find first task
    result_dict = data['result']['value'][1]
    self.assertEqual(result_dict['id'], ptask_id1)
    self.assertEqual(result_dict['ordering'], 5)
    self.assertEqual(result_dict['name'], 'some task')
    self.assertEqual(result_dict['active'], False)
    self.assertEqual(result_dict['interval'], '0 8 * * *')
    self.assertEqual(result_dict['nodes'], ['pinode1', 'pinode2'])
    self.assertEqual(result_dict['last_runs'], {})
    last_update = parse_timestamp(result_dict['last_update'])
    self.assertIsNotNone(last_update)
    self.assertEqual(result_dict['options'],
                     {'something': '123', 'else': 'True'})
    # get one
    status_code, data = self.simulate_request(
        '/periodictask/{}'.format(ptask_id1), method='GET')
    self.assertEqual(status_code, 200)
    self.assertEqual(data['result']['value']['id'], ptask_id1)
    # unknown ID
    status_code, data = self.simulate_request('/periodictask/4242',
                                              method='GET')
    self.assertEqual(status_code, 404)
    self.assertFalse(data['result']['status'])
    # update existing task
    task_dict1['name'] = 'new name'
    task_dict1['options'] = '{"key": "value"}'
    task_dict1['id'] = ptask_id1
    task_dict1['ordering'] = '2'
    with self.mock_task_module():
        status_code, data = self.simulate_request('/periodictask/',
                                                  method='POST',
                                                  data=task_dict1)
        self.assertEqual(status_code, 200)
        self.assertTrue(data['result']['status'])
        self.assertEqual(data['result']['value'], ptask_id1)
    # can list the periodic tasks in new order
    status_code, data = self.simulate_request('/periodictask/',
                                              method='GET')
    self.assertEqual(status_code, 200)
    self.assertTrue(data['result']['status'])
    self.assertEqual(len(data['result']['value']), 2)
    self.assertEqual([task['name'] for task in data['result']['value']],
                     ['new name', 'some other task'])
    # get updated task
    status_code, data = self.simulate_request(
        '/periodictask/{}'.format(ptask_id1), method='GET')
    self.assertEqual(status_code, 200)
    self.assertEqual(data['result']['value']['id'], ptask_id1)
    self.assertEqual(data['result']['value']['ordering'], 2)
    self.assertEqual(data['result']['value']['name'], 'new name')
    self.assertEqual(data['result']['value']['options'], {'key': 'value'})
    self.assertGreater(
        parse_timestamp(data['result']['value']['last_update']),
        last_update)
    last_update = parse_timestamp(data['result']['value']['last_update'])
    # enable
    status_code, data = self.simulate_request(
        '/periodictask/enable/{}'.format(ptask_id1), method='POST')
    self.assertEqual(status_code, 200)
    # get updated task
    status_code, data = self.simulate_request(
        '/periodictask/{}'.format(ptask_id1), method='GET')
    self.assertEqual(status_code, 200)
    self.assertEqual(data['result']['value']['name'], 'new name')
    self.assertEqual(data['result']['value']['active'], True)
    self.assertGreater(
        parse_timestamp(data['result']['value']['last_update']),
        last_update)
    last_update = parse_timestamp(data['result']['value']['last_update'])
    # disable
    status_code, data = self.simulate_request(
        '/periodictask/disable/{}'.format(ptask_id1), method='POST')
    self.assertEqual(status_code, 200)
    # get updated task
    status_code, data = self.simulate_request(
        '/periodictask/{}'.format(ptask_id1), method='GET')
    self.assertEqual(status_code, 200)
    self.assertEqual(data['result']['value']['name'], 'new name')
    self.assertEqual(data['result']['value']['active'], False)
    self.assertGreater(
        parse_timestamp(data['result']['value']['last_update']),
        last_update)
    # disable again without effect
    status_code, data = self.simulate_request(
        '/periodictask/disable/{}'.format(ptask_id1), method='POST')
    self.assertEqual(status_code, 200)
    # get updated task
    status_code, data = self.simulate_request(
        '/periodictask/{}'.format(ptask_id1), method='GET')
    self.assertEqual(status_code, 200)
    self.assertEqual(data['result']['value']['name'], 'new name')
    self.assertEqual(data['result']['value']['active'], False)
    # delete
    status_code, data = self.simulate_request(
        '/periodictask/{}'.format(ptask_id1), method='DELETE')
    self.assertEqual(status_code, 200)
    self.assertEqual(data['result']['value'], ptask_id1)
    # get updated task impossible now
    status_code, data = self.simulate_request(
        '/periodictask/{}'.format(ptask_id1), method='GET')
    self.assertEqual(status_code, 404)
    self.assertFalse(data['result']['status'])
    # only 1 task left
    status_code, data = self.simulate_request('/periodictask/',
                                              method='GET')
    self.assertEqual(status_code, 200)
    self.assertTrue(data['result']['status'])
    self.assertEqual(len(data['result']['value']), 1)
    # delete the second task as well
    status_code, data = self.simulate_request(
        '/periodictask/{}'.format(ptask_id2), method='DELETE')
    self.assertEqual(status_code, 200)
    self.assertEqual(data['result']['value'], ptask_id2)
    # no tasks left
    status_code, data = self.simulate_request('/periodictask/',
                                              method='GET')
    self.assertEqual(status_code, 200)
    self.assertTrue(data['result']['status'])
    self.assertEqual(data['result']['value'], [])
from dateutil.parser import parse as parse_timestamp
from math import ceil
from sys import argv

a_string, b_string = argv[1:]
try:
    a_timestamp = parse_timestamp(a_string)
except ValueError:
    exit('a_timestamp.error = could not parse timestamp (%s)' % a_string)
try:
    b_timestamp = parse_timestamp(b_string)
except ValueError:
    exit('b_timestamp.error = could not parse timestamp (%s)' % b_string)
second_count = (b_timestamp - a_timestamp).total_seconds()
minute_count = second_count / 60.
hour_count = minute_count / 60.
day_count = hour_count / 24.
if day_count > 1:
    print('day_count = %s' % int(ceil(day_count)))
elif hour_count > 1:
    print('hour_count = %s' % int(ceil(hour_count)))
elif minute_count > 1:
    print('minute_count = %s' % int(ceil(minute_count)))
else:
    print('second_count = %s' % int(ceil(second_count)))
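# Example invocations of the script above, assuming it is saved as
# count_elapsed.py (the file name is hypothetical):
#
#     $ python count_elapsed.py 2018-01-01 2018-01-03
#     day_count = 2
#     $ python count_elapsed.py "2018-01-01 10:00" "2018-01-01 10:30"
#     minute_count = 30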
def test_05_scheduling(self):
    # this unit test operates in Russian time
    tzinfo = gettz("Europe/Moscow")
    # at midnight on each 1st
    task1 = set_periodic_task("task one", "0 0 1 * *", ["pinode1"],
                              "some.task.module", 3,
                              {"key1": 1, "key2": False})
    # at 08:00 on wednesdays
    current_utc_time = parse_timestamp("2018-05-31 05:08:00")
    with mock.patch('privacyidea.models.datetime') as mock_dt:
        mock_dt.utcnow.return_value = current_utc_time
        task2 = set_periodic_task("task two", "0 8 * * WED",
                                  ["pinode2", "pinode3"],
                                  "some.task.module", 1,
                                  {"key1": "value", "key2": "foo"},
                                  active=False)
    self.assertEqual(get_periodic_task_by_id(task2)["last_update"],
                     parse_timestamp("2018-05-31 08:08:00+03:00"))
    self.assertEqual(get_periodic_task_by_id(task2)["last_runs"], {})
    # every 30 minutes, on Tuesdays
    task3 = set_periodic_task("task three", "*/30 * * * 2",
                              ["pinode1", "pinode2"],
                              "some.task.module", 2,
                              {"key1": 1234, "key2": 5678})
    # on each 1st of august at midnight
    task4 = set_periodic_task("task four", "0 0 1 8 *", ["pinode2"],
                              "some.task.module", 0)
    # we need some last runs
    set_periodic_task_last_run(task1, "pinode1",
                               parse_timestamp("2018-06-01 00:00:05+03:00"))
    # no last run for pinode3 here!
    set_periodic_task_last_run(task2, "pinode2",
                               parse_timestamp("2018-06-20 08:00:05+03:00"))
    set_periodic_task_last_run(task3, "pinode1",
                               parse_timestamp("2018-06-26 11:36:37+03:00"))
    set_periodic_task_last_run(task3, "pinode2",
                               parse_timestamp("2018-06-26 11:30:33+03:00"))
    set_periodic_task_last_run(task4, "pinode2",
                               parse_timestamp("2017-08-01 00:00:43+03:00"))
    self.assertEqual([task["name"] for task in get_periodic_tasks()],
                     ["task four", "task two", "task three", "task one"])
    # Invalid timestamp
    with self.assertRaises(ParameterError):
        get_scheduled_periodic_tasks("pinode1",
                                     parse_timestamp("2017-08-01 00:00:00"),
                                     tzinfo)
    # On pinode1:
    #   task1 at midnight on each 1st
    #   task3 every 30 minutes on tuesdays
    # On pinode2:
    #   task2 at 08:00 on wednesdays, but it is inactive
    #   task3 every 30 minutes on tuesdays
    #   task4 on each 1st August at midnight
    # 26th June (Tuesday), 11:59
    # No tasks on both nodes
    current_timestamp = parse_timestamp("2018-06-26 11:59+03:00")
    scheduled = get_scheduled_periodic_tasks("pinode1", current_timestamp,
                                             tzinfo)
    self.assertEqual(scheduled, [])
    scheduled = get_scheduled_periodic_tasks("pinode2", current_timestamp,
                                             tzinfo)
    self.assertEqual(scheduled, [])
    # 26th June (Tuesday), 12:00
    # Run task3 on both nodes
    current_timestamp = parse_timestamp("2018-06-26 12:00+03:00")
    scheduled = get_scheduled_periodic_tasks("pinode1", current_timestamp,
                                             tzinfo)
    self.assertEqual([task["name"] for task in scheduled], ["task three"])
    scheduled = get_scheduled_periodic_tasks("pinode2", current_timestamp,
                                             tzinfo)
    self.assertEqual([task["name"] for task in scheduled], ["task three"])
    # 1st August (Wednesday), 13:57
    # Assume task3 has been run successfully on 30th July (Tuesday)
    set_periodic_task_last_run(task3, "pinode1",
                               parse_timestamp("2018-08-01 00:00+03:00"))
    set_periodic_task_last_run(task3, "pinode2",
                               parse_timestamp("2018-08-01 00:00+03:00"))
    # On pinode1, run task1
    # On pinode2, run task4
    current_timestamp = parse_timestamp("2018-08-01 11:59+03:00")
    scheduled = get_scheduled_periodic_tasks("pinode1", current_timestamp,
                                             tzinfo)
    self.assertEqual([task["name"] for task in scheduled], ["task one"])
    scheduled = get_scheduled_periodic_tasks("pinode2", current_timestamp,
                                             tzinfo)
    self.assertEqual([task["name"] for task in scheduled], ["task four"])
    # Enable task2, now we also have to run it on pinode2 and pinode3
    with mock.patch('privacyidea.models.datetime') as mock_dt:
        mock_dt.utcnow.return_value = current_utc_time
        enable_periodic_task(task2)
    scheduled = get_scheduled_periodic_tasks("pinode1", current_timestamp,
                                             tzinfo)
    self.assertEqual([task["name"] for task in scheduled], ["task one"])
    scheduled = get_scheduled_periodic_tasks("pinode2", current_timestamp,
                                             tzinfo)
    self.assertEqual([task["name"] for task in scheduled],
                     ["task four", "task two"])
    scheduled = get_scheduled_periodic_tasks("pinode3", current_timestamp,
                                             tzinfo)
    self.assertEqual([task["name"] for task in scheduled], ["task two"])
    # Simulate runs
    set_periodic_task_last_run(task1, "pinode1", current_timestamp)
    set_periodic_task_last_run(task2, "pinode2", current_timestamp)
    set_periodic_task_last_run(task2, "pinode3", current_timestamp)
    set_periodic_task_last_run(task4, "pinode2", current_timestamp)
    # Now, we don't have to run anything
    current_timestamp += timedelta(seconds=1)
    self.assertEqual(get_scheduled_periodic_tasks("pinode1",
                                                  current_timestamp,
                                                  tzinfo), [])
    self.assertEqual(get_scheduled_periodic_tasks("pinode2",
                                                  current_timestamp,
                                                  tzinfo), [])
    delete_periodic_task(task1)
    delete_periodic_task(task2)
    delete_periodic_task(task3)
    delete_periodic_task(task4)
def test_01_calculate_next_timestamp_utc(self):
    # The easy case: calculate everything in UTC
    tzinfo = tzutc()
    # every day at 08:00
    task1 = {
        "id": 1,
        "active": True,
        "name": "task one",
        "interval": "0 8 * * *",
        "last_update": parse_timestamp("2018-06-23 07:55:00 UTC"),
        "nodes": ["foo", "bar", "baz"],
        "taskmodule": "some.module",
        "options": {"KEY2": "value number 2", "key 4": "1234"},
        "last_runs": {
            "foo": parse_timestamp("2018-06-25 08:04:30 UTC"),
            "bar": parse_timestamp("2018-06-24 07:05:37 UTC"),
        }
    }
    self.assertEqual(calculate_next_timestamp(task1, "foo", tzinfo),
                     parse_timestamp("2018-06-26 08:00 UTC"))
    self.assertEqual(calculate_next_timestamp(task1, "bar", tzinfo),
                     parse_timestamp("2018-06-24 08:00 UTC"))
    # the next run of baz is calculated based on last_update
    self.assertEqual(calculate_next_timestamp(task1, "baz", tzinfo),
                     parse_timestamp("2018-06-23 08:00 UTC"))
    # no last run recorded
    task1b = {
        "id": 1,
        "active": True,
        "name": "task one",
        "interval": "0 8 * * *",
        "last_update": parse_timestamp("2018-06-24 07:55:00 UTC"),
        "nodes": ["foo", "bar"],
        "taskmodule": "some.module",
        "options": {"KEY2": "value number 2", "key 4": "1234"},
        "last_runs": {}
    }
    self.assertEqual(calculate_next_timestamp(task1b, "foo", tzinfo),
                     parse_timestamp("2018-06-24 08:00 UTC"))
    self.assertEqual(calculate_next_timestamp(task1b, "bar", tzinfo),
                     parse_timestamp("2018-06-24 08:00 UTC"))
    # now, "foo" has a last run!
    task1b["last_runs"]["foo"] = parse_timestamp("2018-06-24 08:00 UTC")
    self.assertEqual(calculate_next_timestamp(task1b, "foo", tzinfo),
                     parse_timestamp("2018-06-25 08:00 UTC"))
    # ... bar has still not run
    self.assertEqual(calculate_next_timestamp(task1b, "bar", tzinfo),
                     parse_timestamp("2018-06-24 08:00 UTC"))
    # every weekday
    task2 = {
        "id": 2,
        "active": True,
        "name": "task two",
        "interval": "0 0 * * 1-5",
        "last_update": parse_timestamp("2018-06-24 08:00:00 UTC"),
        "nodes": ["foo", "bar"],
        "taskmodule": "some.module",
        "options": {"KEY2": "value number 2", "key 4": "1234"},
        "last_runs": {
            "localhost": parse_timestamp("2018-06-29 00:00 UTC"),
        }
    }
    self.assertEqual(calculate_next_timestamp(task2, "localhost", tzinfo),
                     parse_timestamp("2018-07-02 00:00 UTC"))
    # at 00:05 in August
    task3 = {
        "id": 3,
        "active": True,
        "name": "task two",
        "interval": "5 0 * 8 *",
        "last_update": parse_timestamp("2018-06-24 08:00:00 UTC"),
        "nodes": ["foo", "bar"],
        "taskmodule": "some.module",
        "options": {"KEY2": "value number 2", "key 4": "1234"},
        "last_runs": {
            "localhost": parse_timestamp("2017-08-31 00:06 UTC"),
        }
    }
    self.assertEqual(calculate_next_timestamp(task3, "localhost", tzinfo),
                     parse_timestamp("2018-08-01 00:05 UTC"))
    # malformed
    task4 = {
        "id": 3,
        "active": True,
        "name": "task two",
        "interval": "every two days",
        "last_update": parse_timestamp("2018-06-24 08:00:00 UTC"),
        "nodes": ["foo", "bar"],
        "taskmodule": "some.module",
        "options": {"KEY2": "value number 2", "key 4": "1234"},
        "last_runs": {
            "localhost": parse_timestamp("2017-08-31 00:06 UTC"),
        }
    }
    with self.assertRaises(ValueError):
        calculate_next_timestamp(task4, "localhost")
def test_02_calculate_next_timestamp_localtime(self):
    # The harder case: calculate everything in a local timezone.
    # There is no DST in Russia, so we operate in +03:00.
    tzinfo = gettz("Europe/Moscow")
    # every day at 08:00
    task = {
        "interval": "0 8 * * *",
        "last_update": parse_timestamp("2018-06-24 02:30 UTC"),
        "last_runs": {
            "foo": parse_timestamp("2018-06-25 05:04:30 UTC"),
        }
    }
    self.assertEqual(calculate_next_timestamp(task, "foo", tzinfo),
                     parse_timestamp("2018-06-26 05:00 UTC"))
    self.assertEqual(calculate_next_timestamp(task, "bar", tzinfo),
                     parse_timestamp("2018-06-24 05:00 UTC"))
    self.assertEqual(calculate_next_timestamp(task,
                                              "this_node_does_not_exist",
                                              tzinfo),
                     parse_timestamp("2018-06-24 05:00 UTC"))
    # every day at 08:00
    task = {
        "interval": "0 8 * * *",
        "last_update": parse_timestamp("2018-06-24 02:30 UTC"),
        "last_runs": {
            "foo": parse_timestamp("2018-06-25 04:04:30 UTC"),
        }
    }
    self.assertEqual(calculate_next_timestamp(task, "foo", tzinfo),
                     parse_timestamp("2018-06-25 05:00 UTC"))
    # every day at midnight
    task = {
        "interval": "0 0 * * *",
        "last_update": parse_timestamp("2018-06-24 02:30 UTC"),
        "last_runs": {
            "foo": parse_timestamp("2018-06-25 21:01 UTC"),
        }
    }
    self.assertEqual(calculate_next_timestamp(task, "foo", tzinfo),
                     parse_timestamp("2018-06-26 21:00 UTC"))
    # every wednesday at midnight
    task = {
        "interval": "0 0 * * 3",
        "last_update": parse_timestamp("2018-06-24 02:30 UTC"),
        "last_runs": {
            # this is actually Monday 00:00 in Russia
            "foo": parse_timestamp("2018-06-24 21:00 UTC"),
        }
    }
    self.assertEqual(calculate_next_timestamp(task, "foo", tzinfo),
                     parse_timestamp("2018-06-26 21:00 UTC"))
    # every 15th at 01:00
    task = {
        "interval": "0 1 15 * *",
        "last_update": parse_timestamp("2018-06-24 02:30 UTC"),
        "last_runs": {
            "foo": parse_timestamp("2018-05-15 00:00 UTC"),
        }
    }
    self.assertEqual(calculate_next_timestamp(task, "foo", tzinfo),
                     parse_timestamp("2018-06-14 22:00 UTC"))
    # every 15th at 01:00
    task = {
        "interval": "0 1 15 * *",
        "last_update": parse_timestamp("2018-06-24 02:30 UTC"),
        "last_runs": {
            "foo": parse_timestamp("2018-05-14 21:59 UTC"),
        }
    }
    self.assertEqual(calculate_next_timestamp(task, "foo", tzinfo),
                     parse_timestamp("2018-05-14 22:00 UTC"))
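# Note on the expectations above: Europe/Moscow is UTC+3 with no DST, so a
# cron time of 08:00 local corresponds to 05:00 UTC, and local midnight
# corresponds to 21:00 UTC on the previous day.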
def parse_sitemap(row, seen):
    if not row or not row["content"]:
        return [], set()
    sitemap_url = row["url"]
    soup = BeautifulSoup(row["content"], "xml")
    elements = soup.findAll("url")
    rows = []
    for elem in elements:
        url_node = elem.find("loc")
        lastmod_node = elem.find("lastmod")
        xmlmeta = "\n".join([str(e) for e in elem.children]).encode("utf-8")
        try:
            lastmod = parse_timestamp(lastmod_node.text.strip())
        except Exception:
            lastmod = NULL_DATE
        try:
            url = url_node.text.strip()
        except Exception:
            continue
        if url:
            url = url.strip()
        if url in seen:
            continue
        row = {
            "url": url.strip(),
            "site": row["site"],
            "name": row["name"],
            "city": row["city"],
            "state": row["state"],
            "loc": row["loc"],
            "lastmod": lastmod,
            "xmlmeta": xmlmeta,
            "is_dumpsterfire": row["is_dumpsterfire"],
            "selector": row['selector']
        }
        if row["url"] not in seen:
            rows.append(row)
            print(blue(json.dumps(
                row,
                indent=4,
                default=lambda x: str(x)
                if isinstance(x, (bytes, datetime.datetime)) else x,
            )))
            seen.add(url.strip())
        if len(rows) > MAX_ARTICLES_PER_SOURCE:
            break
    rows = list(sorted(rows,
                       key=lambda row: ensure_tztime(row["lastmod"]),
                       reverse=True))
    print(
        magenta("[ fetch_sitemap ] "),
        f":: Extracted {len(rows)} urls from sitemap: {sitemap_url}",
    )
    return rows, seen
def list_pages(self, *, chunk=None, chunk_size=None, tags=None,
               maintainers=None, url=None, title=None,
               include_versions=None, include_earliest=None,
               include_latest=None, source_type=None, hash=None,
               start_date=None, end_date=None, active=None):
    """
    List all Pages, optionally filtered by search criteria.

    Parameters
    ----------
    chunk : integer, optional
        pagination parameter
    chunk_size : integer, optional
        number of items per chunk
    tags : list of string, optional
    maintainers : list of string, optional
    url : string, optional
    title : string, optional
    include_versions : boolean, optional
    include_earliest : boolean, optional
    include_latest : boolean, optional
    source_type : string, optional
        such as 'versionista' or 'internet_archive'
    hash : string, optional
        SHA256 hash of Version content
    start_date : datetime, optional
    end_date : datetime, optional
    active : boolean, optional

    Returns
    -------
    response : dict
    """
    params = {
        'chunk': chunk,
        'chunk_size': chunk_size,
        'tags[]': tags,
        'maintainers[]': maintainers,
        'url': url,
        'title': title,
        'include_versions': include_versions,
        'include_earliest': include_earliest,
        'include_latest': include_latest,
        'source_type': source_type,
        'hash': hash,
        'capture_time': _time_range_string(start_date, end_date),
        'active': active
    }
    url = f'{self._api_url}/pages'
    res = requests.get(url, auth=self._auth, params=params)
    _process_errors(res)
    result = res.json()
    data = result['data']
    # In place, replace datetime strings with datetime objects.
    for page in data:
        page['created_at'] = parse_timestamp(page['created_at'])
        page['updated_at'] = parse_timestamp(page['updated_at'])
        if 'earliest' in page:
            page['earliest']['capture_time'] = parse_timestamp(
                page['earliest']['capture_time'])
            page['earliest']['created_at'] = parse_timestamp(
                page['earliest']['created_at'])
            page['earliest']['updated_at'] = parse_timestamp(
                page['earliest']['updated_at'])
        if 'latest' in page:
            page['latest']['capture_time'] = parse_timestamp(
                page['latest']['capture_time'])
            page['latest']['created_at'] = parse_timestamp(
                page['latest']['created_at'])
            page['latest']['updated_at'] = parse_timestamp(
                page['latest']['updated_at'])
        if 'versions' in page:
            for v in page['versions']:
                v['created_at'] = parse_timestamp(v['created_at'])
                v['updated_at'] = parse_timestamp(v['updated_at'])
                v['capture_time'] = parse_timestamp(v['capture_time'])
    return result
def list_versions(self, *, page_id=None, chunk=None, chunk_size=None,
                  start_date=None, end_date=None, source_type=None,
                  hash=None, source_metadata=None, different=None,
                  include_change_from_previous=None,
                  include_change_from_earliest=None):
    """
    List Versions, optionally filtered by search criteria, including Page.

    Parameters
    ----------
    page_id : string, optional
        restricts search to Versions of a specific Page
    chunk : integer, optional
        pagination parameter
    chunk_size : integer, optional
        number of items per chunk
    start_date : datetime, optional
    end_date : datetime, optional
    source_type : string, optional
        such as 'versionista' or 'internetarchive'
    hash : string, optional
        SHA256 hash of Version content
    source_metadata : dict, optional
        Examples:

        * ``{'version_id': 12345678}``
        * ``{'account': 'versionista1', 'has_content': True}``
    different : boolean, optional
        If False, include versions that aren't actually different from the
        previous version of the same page in the response.
    include_change_from_previous : boolean, optional
        If True, include a `change_from_previous` field in each version
        that represents a change object between it and the previous
        version of the same page.
    include_change_from_earliest : boolean, optional
        If True, include a `change_from_earliest` field in each version
        that represents a change object between it and the earliest
        version of the same page.

    Returns
    -------
    response : dict
    """
    params = {
        'chunk': chunk,
        'chunk_size': chunk_size,
        'capture_time': _time_range_string(start_date, end_date),
        'source_type': source_type,
        'hash': hash,
        'different': different,
        'include_change_from_previous': include_change_from_previous,
        'include_change_from_earliest': include_change_from_earliest
    }
    if source_metadata is not None:
        for k, v in source_metadata.items():
            params[f'source_metadata[{k}]'] = v
    if page_id is None:
        url = f'{self._api_url}/versions'
    else:
        url = f'{self._api_url}/pages/{page_id}/versions'
    res = requests.get(url, auth=self._auth, params=params)
    _process_errors(res)
    result = res.json()
    # In place, replace datetime strings with datetime objects.
    for v in result['data']:
        v['created_at'] = parse_timestamp(v['created_at'])
        v['updated_at'] = parse_timestamp(v['updated_at'])
        v['capture_time'] = parse_timestamp(v['capture_time'])
    return result