def _normalize_edtf(s):
    """Parse ``s`` as EDTF, falling back to the fully unknown date.

    Args:
        s: EDTF string to parse. A falsy value or the single character
            ``'u'`` is treated as "unknown" and is not passed to the parser.

    Returns:
        The result of ``parse_edtf(s)``, or ``parse_edtf('uuuu')`` when ``s``
        is falsy, equals ``'u'``, or raises any ``Exception`` while parsing.
    """
    if s and s != 'u':
        try:
            parsed = parse_edtf(s)
        except Exception:
            # Best effort: any parse failure degrades to the unknown date.
            pass
        else:
            return parsed

    # when all else fails, return the "most unknown" EDTF.
    return parse_edtf('uuuu')
def is_parseable(val):
    """Report whether ``val`` is a string that ``parse_edtf`` accepts.

    Args:
        val: Candidate value; must be a string.

    Returns:
        True when ``parse_edtf(val)`` completes, False when it raises
        ``ValueError`` or ``OverflowError``.

    Raises:
        TypeError: If ``val`` is not a string.
    """
    # isinstance (instead of an exact type comparison) also admits str
    # subclasses. The check lives outside the try block because TypeError is
    # deliberately not one of the swallowed exceptions.
    if not isinstance(val, str):
        raise TypeError(
            "Values passed to expect_column_values_to_be_edtf_parseable must be of type string.\nIf you want to validate a column of dates or timestamps, please call the expectation before converting from string format."
        )

    try:
        parse_edtf(val)
    except (ValueError, OverflowError):
        return False
    return True
def parse_edtf_level0(edtfstr):
    """Parse an EDTF input string.

    Args:
        edtfstr: The string handed to ``parse_edtf``.

    Returns:
        Whatever ``parse_edtf(edtfstr)`` returns.

    Raises:
        EDTFValueError: If ``parse_edtf`` raises ``ParseException``.
    """
    try:
        parsed = parse_edtf(edtfstr)
    except ParseException:
        message = "The string is not a valid EDTF-formatted string."
        raise EDTFValueError(message)
    return parsed
def convertEDTFdate(date):
    """Convert an EDTF date into strict lower/upper ``YYYY-MM-DD`` strings.

    Args:
        date: Date string; it is passed through ``downgradeEDTF`` and then
            ``parse_edtf``.

    Returns:
        dict with keys ``'lower'`` and ``'upper'``, each formatted with
        ``time.strftime("%Y-%m-%d", ...)`` from the strict bounds.

    Raises:
        ValueError: ``('Invalid date', date)`` when downgrading or parsing
            fails.
    """
    try:
        d = parse_edtf(downgradeEDTF(date))
    except Exception:
        # Was a bare ``except:``, which also converted KeyboardInterrupt and
        # SystemExit into ValueError.
        raise ValueError('Invalid date', date)

    def _first(value):
        # Some values arrive wrapped in a list; only the first entry is used.
        return value[0] if type(value) is list else value

    # Interval-like results are detected by type name, as in the original.
    if 'Interval' in str(type(d)):
        lower = _first(d.lower).lower_strict()
        upper = _first(d.upper).upper_strict()
    else:
        single = _first(d)
        lower = single.lower_strict()
        upper = single.upper_strict()

    return {
        'lower': time.strftime("%Y-%m-%d", lower),
        'upper': time.strftime("%Y-%m-%d", upper)
    }
def parse(self, date=None):
    """Parse ``date`` as EDTF and store the outcome on ``self``.

    Year-only input that ``int()`` accepts is first normalised: padded to
    four digits (five characters for negative years) and prefixed with ``y``
    when the year has more than four digits.

    Args:
        date: The value to parse. ``None`` short-circuits without touching
            any attribute.

    Returns:
        None. Results are stored on ``self``: ``result_set`` when
        ``handle_object`` returns a list, otherwise ``lower``, ``upper``,
        ``lower_fuzzy`` and ``upper_fuzzy`` copied from its result.
    """
    if date is None:
        return None

    self.edtf = None
    self.orig_date = date
    self.result_set = None
    self.error = None

    try:
        # handle for incorrectly formatted year only dates
        # (eg: 290 => 0290, 11909 => y11909)
        year = int(date)
        width = 4 if year >= 0 else 5
        date = str(year).zfill(width)
        if year != 0 and len(str(abs(year))) > 4:
            date = 'y' + date
    except Exception:
        # Not a plain year: leave the input untouched.
        pass

    self.edtf = parse_edtf(date)
    result = self.handle_object(self.edtf)
    if isinstance(result, list):
        self.result_set = result
        return None

    self.lower = result.lower
    self.upper = result.upper
    self.lower_fuzzy = result.lower_fuzzy
    self.upper_fuzzy = result.upper_fuzzy
def get_date_radical_from_gregorian(date_display: str) -> Optional[str]:
    """Render an EDTF date in the French Republican calendar.

    Args:
        date_display: EDTF string to convert.

    Returns:
        ``"<day> <month> an <year>"`` for the strict lower bound, followed by
        ``" - "`` and the same rendering of the strict upper bound when the
        two bounds differ. ``None`` when the input is falsy, parses to a
        falsy value, or raises ``EDTFParseException``.
    """
    if not date_display:
        return None

    def _as_republican(bound) -> str:
        # Convert the first three fields (year, month, day) of a strict bound.
        converted = french_republican.from_gregorian(*bound[:3])
        month_name = french_republican.MOIS[converted[1] - 1].lower()
        return f"{converted[2]} {month_name} an {int(converted[0])}"

    try:
        date_edtf = parse_edtf(date_display)
        if not date_edtf:
            return None

        earliest = date_edtf.lower_strict()
        date_str = _as_republican(earliest)

        latest = date_edtf.upper_strict()
        if earliest != latest:
            date_str = f"{date_str} - {_as_republican(latest)}"
        return date_str
    except EDTFParseException:
        return None
def parse(self, date=None):
    """Parse ``date`` as EDTF and store the outcome on ``self``.

    Year-only input that ``int()`` accepts is first normalised: padded to
    four digits (five characters for negative years) and prefixed with ``y``
    when the year has more than four digits.

    Args:
        date: The value to parse. ``None`` short-circuits without touching
            any attribute.

    Returns:
        None. Results are stored on ``self``: ``result_set`` when
        ``handle_object`` returns a list, otherwise ``lower``, ``upper``,
        ``lower_fuzzy`` and ``upper_fuzzy`` copied from its result.
    """
    # Identity test: ``== None`` can be hijacked by a custom ``__eq__``.
    if date is None:
        return None

    self.edtf = None
    self.orig_date = date
    self.result_set = None
    self.error = None

    try:
        # handle for incorrectly formatted year only dates
        # (eg: 290 => 0290, 11909 => y11909)
        if int(date) >= 0:
            date = str(int(date)).zfill(4)
        else:
            date = str(int(date)).zfill(5)
        if len(str(abs(int(date)))) > 4 and int(date) != 0:
            date = 'y' + date
    except Exception:
        # Not a plain year: leave the input untouched. This used to be a bare
        # ``except:``, which also swallowed KeyboardInterrupt and SystemExit.
        pass

    self.edtf = parse_edtf(date)
    result = self.handle_object(self.edtf)
    if isinstance(result, list):
        self.result_set = result
    else:
        self.lower = result.lower
        self.upper = result.upper
        self.lower_fuzzy = result.lower_fuzzy
        self.upper_fuzzy = result.upper_fuzzy
def clean(self):
    """Validate the EDTF string pair and derive date bounds from it.

    When both strings are set, ``edtf_string`` is parsed and the fuzzy and
    strict bounds plus ``nat_lang_year`` are stored on the instance.

    Raises:
        ValidationError: If exactly one of ``nat_lang_edtf_string`` and
            ``edtf_string`` is set, or if ``edtf_string`` does not parse.
    """
    super().clean()
    # Per Django docs: validate and modify values in Model.clean()
    # https://docs.djangoproject.com/en/3.1/ref/models/instances/#django.db.models.Model.clean

    has_natural = bool(self.nat_lang_edtf_string)
    has_edtf = bool(self.edtf_string)

    # Check that nat_lang_edtf_string and edtf_string are either both set,
    # or both unset
    if has_natural != has_edtf:
        raise ValidationError(
            'If setting a date on a composition, an EDTF string and a natural language EDTF string must be provided.'
        )

    # Nothing to derive when no date was supplied.
    if not (has_edtf and has_natural):
        return

    # Validate edtf_string
    try:
        e = parse_edtf(self.edtf_string)
    except EDTFParseException:
        raise ValidationError({
            'edtf_string':
            '{} is not a valid EDTF string'.format(self.edtf_string)
        })

    self.lower_fuzzy = struct_time_to_date(e.lower_fuzzy())
    self.upper_fuzzy = struct_time_to_date(e.upper_fuzzy())
    self.lower_strict = struct_time_to_date(e.lower_strict())
    self.upper_strict = struct_time_to_date(e.upper_strict())

    first_year = self.lower_strict.year
    last_year = self.upper_strict.year
    if first_year == last_year:
        self.nat_lang_year = str(first_year)
    else:
        self.nat_lang_year = '{}-{}'.format(first_year, last_year)
def to_python(self, value):
    """Convert ``value`` to an EDTF object.

    Args:
        value: An ``EDTFObject``, ``None``, or something ``parse_edtf``
            accepts.

    Returns:
        ``value`` unchanged when it is already an ``EDTFObject`` or is
        ``None``; otherwise ``parse_edtf(value, fail_silently=True)``.
    """
    needs_no_conversion = isinstance(value, EDTFObject) or value is None
    if needs_no_conversion:
        return value
    return parse_edtf(value, fail_silently=True)
def _normalize_edtf(s):
    """Parse ``s`` as EDTF, falling back to ``MOST_UNKNOWN_EDTF``.

    Args:
        s: EDTF string to parse. Falsy values, ``'u'`` and ``'uuuu'`` all
            mean "unknown" and skip the parser.

    Returns:
        The result of ``parse_edtf(s)``, or ``MOST_UNKNOWN_EDTF`` when ``s``
        means "unknown" or raises any ``Exception`` while parsing.
    """
    if s and s != 'u' and s != 'uuuu':
        try:
            parsed = parse_edtf(s)
        except Exception:
            # Best effort: any parse failure degrades to the unknown date.
            pass
        else:
            return parsed

    # when all else fails, return the "most unknown" EDTF.
    return MOST_UNKNOWN_EDTF
def from_db_value(self, value, expression, connection, context):
    """Convert a value read from the database into a Python object.

    Args:
        value: Raw stored value.
        expression: Not used by this method.
        connection: Not used by this method.
        context: Not used by this method.

    Returns:
        ``None`` for a falsy ``value``; the unpickled object when
        ``pickle.loads(str(value))`` succeeds; otherwise
        ``parse_edtf(value, fail_silently=True)``.
    """
    # Converting values to Python objects
    if not value:
        return None

    try:
        # SECURITY NOTE(review): pickle.loads can execute arbitrary code if
        # the stored value is attacker-controlled. Kept so existing pickled
        # rows still load; confirm the column is trusted.
        return pickle.loads(str(value))
    except Exception:
        # Not a pickle payload: fall back to EDTF parsing. This used to be a
        # bare ``except:``, which also swallowed KeyboardInterrupt/SystemExit.
        pass

    return parse_edtf(value, fail_silently=True)
def get_publication_year(self, obj):
    """Get publication year from edtf date.

    Args:
        obj: Mapping holding ``obj["metadata"]["publication_date"]``.

    Returns:
        The year of the date's strict lower bound, as a string.

    Raises:
        ValidationError: If ``parse_edtf`` raises ``ParseException``.
    """
    # The lookup sits outside the try block: a missing key was never handled
    # by the ParseException clause, so it propagates exactly as before.
    raw_date = obj["metadata"]["publication_date"]
    try:
        earliest = parse_edtf(raw_date).lower_strict()
    except ParseException:
        # NOTE: Should not fail since it was validated at service schema
        raise ValidationError(
            "Unable to parse publicationYear from publication_date")
    return str(earliest.tm_year)
def get_publication_year(self, obj):
    """Get publication year from edtf date.

    Args:
        obj: Mapping holding ``obj["metadata"]["publication_date"]``.

    Returns:
        The year of the date's strict lower bound, as a string.

    Raises:
        ValidationError: If ``parse_edtf`` raises ``ParseException``; the
            failure is logged through ``current_app.logger`` first.
    """
    try:
        publication_date = obj["metadata"]["publication_date"]
        parsed_date = parse_edtf(publication_date)
        return str(parsed_date.lower_strict().tm_year)
    except ParseException:
        # Should not fail since it was validated at service schema
        # The trailing space after "for" was missing, so the two literals
        # used to run together as "forrecord".
        current_app.logger.error(
            "Error parsing publication_date field for "
            f"record {obj['metadata']}")
        raise ValidationError(_("Invalid publication date value."))
def pre_save(self, instance, add):
    """
    Recompute the EDTF value from the natural-text field and refresh the
    derived date fields on ``instance``.

    Does nothing (returns None) when no ``natural_text_field`` is configured
    or ``self.attname`` is not in ``instance.__dict__``. Otherwise returns
    the EDTF value that was stored on the instance (possibly None).

    Raises NotImplementedError when a configured derived field is neither a
    FloatField nor a DateField.
    """
    if not self.natural_text_field or self.attname not in instance.__dict__:
        return

    # Read of the current value; it is replaced unconditionally below.
    edtf = getattr(instance, self.attname)

    # Update EDTF field based on latest natural text value, if any
    natural_text = getattr(instance, self.natural_text_field)
    edtf = text_to_edtf(natural_text) if natural_text else None

    # TODO If `natural_text_field` becomes cleared the derived EDTF field
    # value should also be cleared, rather than left at original value?

    # TODO Handle case where EDTF field is set to a string directly, not
    # via `natural_text_field` (this is a slightly unexpected use-case, but
    # is a very efficient way to set EDTF values in situations like for API
    # imports so we probably want to continue to support it?)
    if edtf and not isinstance(edtf, EDTFObject):
        edtf = parse_edtf(edtf, fail_silently=True)

    setattr(instance, self.attname, edtf)

    # set or clear related date fields on the instance
    for attr in DATE_ATTRS:
        g = getattr(self, "%s_field" % attr, None)
        if not g:
            continue

        if not edtf:
            setattr(instance, g, None)
            continue

        try:
            target_field = instance._meta.get_field(g)
        except FieldDoesNotExist:
            continue

        value = getattr(edtf, attr)()  # struct_time
        if isinstance(target_field, models.FloatField):
            value = struct_time_to_jd(value)
        elif isinstance(target_field, models.DateField):
            value = struct_time_to_date(value)
        else:
            raise NotImplementedError(
                u"EDTFField does not support %s as a derived data"
                u" field, only FloatField or DateField" % type(target_field))
        setattr(instance, g, value)

    return edtf
def dump(self, record, data):
    """Dump the data.

    Adds ``<key>_start`` and ``<key>_end`` ISO dates (the strict lower and
    upper bounds of the EDTF value stored under ``self.key``) to the parent
    mapping located via ``dict_lookup``.

    Args:
        record: Not used by this method.
        data: Mapping that is updated in place.

    Returns:
        ``data`` when the field is missing or not valid EDTF; ``None`` after
        a successful update.
    """
    try:
        parent_data = dict_lookup(data, self.keys, parent=True)

        pd = parse_edtf(parent_data[self.key])
        # Build the dates directly from the year/month/day fields. The former
        # timegm() -> date.fromtimestamp() round trip interpreted a UTC epoch
        # in the *local* time zone, which shifts the result by one day on
        # hosts west of UTC.
        parent_data[f"{self.key}_start"] = date(
            *pd.lower_strict()[:3]).isoformat()
        parent_data[f"{self.key}_end"] = date(
            *pd.upper_strict()[:3]).isoformat()
    except (KeyError, EDTFParseException):
        # The field does not exists or had wrong data
        return data  # FIXME: should log this in debug mode?
def dump(self, record, data):
    """Dump the data.

    Stores a ``{"gte": ..., "lte": ...}`` range under ``self.range_key`` in
    the parent mapping located via ``dict_lookup``, built from the strict
    bounds of the EDTF value under ``self.key``.

    Returns ``data`` when the field is missing or not valid EDTF; ``None``
    after a successful update.
    """
    try:
        container = dict_lookup(data, self.keys, parent=True)
        parsed = parse_edtf(container[self.key])

        earliest = _format_date(parsed.lower_strict())
        latest = _format_date(parsed.upper_strict())
        container[self.range_key] = {"gte": earliest, "lte": latest}
    except (KeyError, EDTFParseException):
        # The field does not exists or had wrong data
        return data  # FIXME: should log this in debug mode?
def date_is_absolute(self):
    """Tell whether ``self.input`` is an absolute (EDTF) date.

    Returns:
        True when ``parse_edtf`` accepts the input (the result is stored in
        ``self.date``); False when neither ``parse_edtf`` nor ``pd.parse``
        accepts it.

    Raises:
        CommandError: When the input is not EDTF but ``pd.parse`` still
            accepts it.
    """
    try:
        self.date = parse_edtf(self.input)
    except EDTFParseException:
        # Not EDTF. If the fallback parser understands it, it is a date in
        # an unsupported format; otherwise it is not an absolute date.
        try:
            pd.parse(self.input)
        except pd.parsing.exceptions.ParserError:
            return False
        raise CommandError("Absolute dates must be of the format "
                           "YYYY, YYYY-MM or YYYY-MM-DD")
    return True
def date_to_int(val):
    """Encode an EDTF date as the integer ``year * 10000 + MMDD``.

    Args:
        val: Value to parse; when plain parsing fails it is retried as a
            long year (``"y<val>"``).

    Returns:
        int: the year times 10000 plus the zero-padded month and day read as
        one number; month and day count as 0 when unset or for a
        ``LongYear``.
    """
    try:
        parsed = parse_edtf(val)
    except EDTFParseException:
        # if there's a problem parsing, try this as a long year
        parsed = parse_edtf("y{}".format(val))

    # if it's a real DateAndTime (from a date node), must parse it further
    if isinstance(parsed, DateAndTime):
        parsed = parse_edtf(str(parsed.date))

    year_part = int(parsed.year) * 10000

    if isinstance(parsed, LongYear):
        month_day = "0000"
    else:
        month = int(parsed.month) if parsed.month else 0
        day = int(parsed.day) if parsed.day else 0
        month_day = "".join((str(month).zfill(2), str(day).zfill(2)))

    return year_part + int(month_day)
def post_shows(api, scroll):
    """Scrape the Studs Terkel archive listing and post each show as an event.

    Every ``<p>`` on the listing page that holds a link with a date
    ``<span>`` becomes one event sent through ``api.create_event``; the JSON
    response is pretty-printed.

    Args:
        api: Client providing ``cache_wiki_thumbnail`` and ``create_event``.
        scroll: Passed through to ``api.create_event``.
    """
    url = 'https://studsterkel.wfmt.com/explore#t=date'
    soup = get_url_as_soup(url)

    for p in soup.find_all('p'):
        a = p.find('a')
        if a is None:
            continue
        date = a.find('span')
        if date is None:
            continue

        # Detach every <span> from the link (this mutates `a`) so that its
        # text is the title alone; `date` keeps its own text.
        for s in a('span'):
            s.extract()

        _edtf = parse_edtf(text_to_edtf(date.text))
        title = a.text.strip()
        person = get_person(title)

        # The original literal listed 'with_thumbnail' twice (None, then the
        # cached thumbnail); only the second value ever took effect.
        show = {
            'when_happened': struct_time_to_datetime(_edtf.upper_strict()),
            'resolution': len(str(_edtf)),
            'when_original': date.text,
            'content_url':
            'https://studsterkel.wfmt.com{}'.format(a.get('href')),
            'title': title,
            'text': '',
            'media_type': 'audio/mpeg',
            'content_type': 'Oral histories',
            'source_url': 'https://studsterkel.wfmt.com/',
            'with_thumbnail': api.cache_wiki_thumbnail(person)
        }
        resp = api.create_event(show, scroll)
        pprint(resp.json())
def dump(self, record, data):
    """Dump the data.

    For each item of the list located via ``dict_lookup``, stores a
    ``{"gte": ..., "lte": ...}`` range under ``self.range_key``, built from
    the strict bounds of the EDTF value under ``self.key``.

    Returns ``data`` as soon as a key is missing or a value is not valid
    EDTF (items processed before that keep their range); ``None`` otherwise.
    """
    try:
        entries = dict_lookup(data, self.keys, parent=False)

        # EDTF parse_edtf (using pyparsing) expects a string
        for entry in entries:
            parsed = parse_edtf(entry[self.key])
            earliest = _format_date(parsed.lower_strict())
            latest = _format_date(parsed.upper_strict())
            entry[self.range_key] = {"gte": earliest, "lte": latest}
    except (KeyError, EDTFParseException):
        # The field does not exists or had wrong data
        return data  # FIXME: should log this in debug mode?
def add_first_link(w):
    """Turn a scraped event record into a postable event dict, in place.

    Splits the event text into ``title``/``text``, records the first link as
    ``content_url``/``item``, and converts ``date`` + ``year`` into
    ``when_happened``.

    Args:
        w: Mapping with keys ``event`` (a parsed HTML element), ``context``,
            ``date`` and ``year``; may be ``None``.

    Returns:
        ``w`` (with the scratch keys ``event``, ``date``, ``context`` and
        ``header`` removed) when the date converts; ``None`` when ``w`` is
        ``None``, has no ``event``, or the date conversion fails.
    """
    if w is None or 'event' not in w:
        return None

    e = w['event']
    # Drop <sup> and <span> tags before reading the text.
    for tag in e(['sup', 'span']):
        tag.decompose()

    # Raw strings: the previous plain literals relied on invalid escape
    # sequences (\s, \d, \.), which newer Python versions warn about.
    text = e.text.rstrip()
    text = re.sub(r'^\s*\d+:\s*', '', text)
    text = re.sub(r'^:\s*', '', text)
    if w['context'] is not None and w['context'] != '':
        # NOTE(review): this restarts from the raw event text and so drops
        # the prefix clean-up above; confirm that is intended.
        text = '{}: {}'.format(w['context'], e.text.rstrip())

    # First sentence (ending in "<two+ lowercase letters>. ") is the title.
    m = re.match(r'^(.+[a-z]{2,}\.\s+)(.*)', text)
    if m is not None:
        title = m.group(1)
        text = m.group(2)
    else:
        title = text
        text = ''
    w['title'] = title
    w['text'] = text

    links = e.select('a')
    if len(links) > 0 and links[0] is not None:
        href = links[0].get('href')
        w['content_url'] = 'https://en.wikipedia.org{}'.format(href, )
        w['item'] = re.sub(r'/wiki/|/w/index.php\?title\=', '', href)

    date_text = '{} {}'.format(w['date'], w['year'])
    date_text = re.sub('–', '-', date_text)
    try:
        edtf_date_txt = text_to_edtf(date_text)
        edtf_date = parse_edtf(edtf_date_txt)
        iso_date = time.strftime('%Y-%m-%dT%H:%M:%SZ',
                                 edtf_date.upper_fuzzy())
        w['when_happened'] = iso_date
        w['when_original'] = date_text
        w['resolution'] = 10
        del w['event']
        del w['date']
        del w['context']
        if 'header' in w:
            del w['header']
        return w
    except Exception:
        # Best effort: records whose date cannot be converted are skipped.
        return None
def get_issued(self, obj):
    """Get issued dates.

    Args:
        obj: Mapping holding ``obj["metadata"]["publication_date"]``, a
            string of one or more EDTF dates separated by ``/``.

    Returns:
        ``{"date-parts": [...]}`` with one list per date, containing those of
        year, month and day (in that order) that are truthy.
    """
    raw_dates = obj["metadata"]["publication_date"].split("/")

    date_parts = []
    for raw in raw_dates:
        parsed = parse_edtf(raw)
        components = (parsed.year, parsed.month, parsed.day)
        date_parts.append([part for part in components if part])

    return {"date-parts": date_parts}
def clean(self):
    """Parse ``edtf_string`` and store the derived bounds and display year.

    Sets ``lower_fuzzy``, ``upper_fuzzy``, ``lower_strict``, ``upper_strict``
    and ``nat_lang_year`` on the instance.

    Raises:
        ValidationError: Keyed on ``edtf_string`` when it does not parse.
    """
    try:
        e = parse_edtf(self.edtf_string)
    except EDTFParseException:
        raise ValidationError({
            'edtf_string':
            '{} is not a valid EDTF string'.format(self.edtf_string)
        })

    # Each bound is stored under the same name as the EDTF accessor.
    for bound in ('lower_fuzzy', 'upper_fuzzy', 'lower_strict',
                  'upper_strict'):
        setattr(self, bound, struct_time_to_date(getattr(e, bound)()))

    first_year = self.lower_strict.year
    last_year = self.upper_strict.year
    if first_year == last_year:
        self.nat_lang_year = str(first_year)
    else:
        self.nat_lang_year = '{}-{}'.format(first_year, last_year)
def save(self, *args, **kwargs):
    """Derive the date bounds from ``edtf_string``, then save.

    Sets ``lower_fuzzy``, ``upper_fuzzy``, ``lower_strict``, ``upper_strict``
    and ``nat_lang_year`` before delegating to the parent ``save``.

    Raises:
        ValidationError: If ``edtf_string`` does not parse.
    """
    try:
        e = parse_edtf(self.edtf_string)
    except EDTFParseException:
        message = '{} is not a valid EDTF string'.format(self.edtf_string)
        raise ValidationError(message)

    self.lower_fuzzy = struct_time_to_date(e.lower_fuzzy())
    self.upper_fuzzy = struct_time_to_date(e.upper_fuzzy())
    self.lower_strict = struct_time_to_date(e.lower_strict())
    self.upper_strict = struct_time_to_date(e.upper_strict())

    # A single year is shown alone; a span is shown as "first-last".
    spans_years = self.lower_strict.year != self.upper_strict.year
    self.nat_lang_year = (
        '{}-{}'.format(self.lower_strict.year, self.upper_strict.year)
        if spans_years else str(self.lower_strict.year))

    super().save(*args, **kwargs)
def __call__(self, value):
    """Validate.

    Args:
        value: EDTF string to check.

    Returns:
        ``value`` unchanged when it passes every configured check.

    Raises:
        ValidationError: If ``value`` does not parse, is not an instance of
            any type in ``self._types`` (when set), or - when
            ``self._chronological_interval`` is set - its strict upper bound
            precedes its strict lower bound.
    """
    try:
        parsed = parse_edtf(value)
    except ParseException:
        raise ValidationError(self._format_error(value, None))

    if self._types:
        matches_type = any(isinstance(parsed, t) for t in self._types)
        if not matches_type:
            raise ValidationError(self._format_error(value, parsed))

    if self._chronological_interval:
        # We require intervals to be chronological. EDTF Date and Interval
        # both have same interface and
        # date.lower_strict() <= date.upper_strict() is always True for a
        # Date
        runs_backwards = parsed.upper_strict() < parsed.lower_strict()
        if runs_backwards:
            raise ValidationError(self._format_error(value, parsed))

    return value
def get_issued(self, obj):
    """Get issued dates.

    Args:
        obj: Mapping whose ``obj["metadata"]`` may hold ``publication_date``.

    Returns:
        ``{"date-parts": [...]}`` with one entry for a ``Date`` and two
        (lower, upper) for an ``Interval``; ``missing`` when the value does
        not parse or is of any other type.
    """
    try:
        parsed = parse_edtf(obj["metadata"].get("publication_date"))
    except EDTFParseException:
        return missing

    def _parts(edtf_date):
        # Year/month/day of one date, filtered by add_if_not_none.
        return add_if_not_none(edtf_date.year, edtf_date.month,
                               edtf_date.day)

    if isinstance(parsed, Date):
        return {"date-parts": [_parts(parsed)]}

    if isinstance(parsed, Interval):
        start, end = parsed.lower, parsed.upper
        return {"date-parts": [_parts(start), _parts(end)]}

    return missing
def parse_date(self) -> Optional[struct_time]:
    """Parse ``self.date_display`` as EDTF.

    Returns:
        The result of ``parse_edtf(self.date_display)``, or ``None`` when an
        ``AttributeError`` or ``EDTFParseException`` is raised.

    NOTE(review): the annotation says ``struct_time`` but the value returned
    is whatever ``parse_edtf`` yields - confirm the intended type.
    """
    try:
        parsed = parse_edtf(self.date_display)
    except (AttributeError, EDTFParseException):
        parsed = None
    return parsed
# because the meta csv file didn't have it set if we're trying # to fetch the raw json in the first place. But this is meant # to catch this scenario. self.wof_id = wof_id self.reason = reason self.message = message self.halt = halt self.skipped = skipped self.funky = funky self.superseded = superseded # keep this as a constant - it actually take a significant amount of time to # re-parse this every time, when we know it's a constant. MOST_UNKNOWN_EDTF = parse_edtf('uuuu') # given a string, parse it as EDTF while allowing a single 'u', four u's # 'uuuu', or None to mean completely unknown, and return the EDTF object. def _normalize_edtf(s): if s and s != 'u' and s != 'uuuu': try: return parse_edtf(s) except Exception: pass # when all else fails, return the "most unknown" EDTF. return MOST_UNKNOWN_EDTF
def prepare_feature(self, f, **kwargs):
    """Fill in default Who's On First properties on feature ``f`` in place.

    Sets the geometry hash, makes sure the feature has a ``wof:id`` (asking
    ``u.generate_id()`` for one when missing), stamps ``wof:lastmodified``,
    stubs out empty list properties, normalises ``mz:is_current``, derives
    ``wof:repo`` from ``self.root`` and expands the EDTF inception/cessation
    values into ``date:*_lower`` / ``date:*_upper`` strings.

    Returns False when ``u.generate_id()`` yields 0. No other return
    statement is visible in this excerpt.

    ``kwargs`` is accepted but not used in the visible code.
    """
    props = f['properties']
    props['wof:geomhash'] = u.hash_geom(f)

    # who am I ?
    # have I been here before ?
    # why is the sky blue ?
    # also, what time is it?
    now = int(time.time())

    wofid = None
    if props.has_key('wof:id'):
        wofid = props['wof:id']

    # no id yet: mint one, and record the creation time alongside it
    if wofid == None:
        logging.debug(
            "This record has no wofid so now asking what Brooklyn would do..."
        )
        wofid = u.generate_id()
        if wofid == 0:
            logging.error("OH NO - can't get integer!")
            return False
        props['wof:id'] = wofid
        props['wof:created'] = now

    f['id'] = props['wof:id']
    props['wof:lastmodified'] = now

    # TO DO: FIGURE OUT HOW TO DERIVE DEFAULTS FROM
    # py-mapzen-whosonfirst-validator (20150922/thisisaaronland)

    # stubs
    # (any missing or falsy value is replaced with an empty list)
    for k in ('supersedes', 'superseded_by', 'hierarchy', 'belongsto',
              'breaches'):
        k = "wof:%s" % k
        if not props.get(k, False):
            props[k] = []

    # ensure 'mz:' properties
    # https://github.com/whosonfirst/whosonfirst-data/issues/320
    if props.get('mz:hierarchy_label', None) == None:
        props['mz:hierarchy_label'] = 1

    # accept -1/0/1 either as numbers or as their string forms; anything
    # else (including a missing value) becomes -1
    is_current = props.get("mz:is_current", None)
    if not is_current in (-1, 0, 1):
        if str(is_current) == "-1":
            is_current = -1
        elif str(is_current) == "0":
            is_current = 0
        elif str(is_current) == "1":
            is_current = 1
        else:
            is_current = -1
    props['mz:is_current'] = is_current

    # ensure 'wof:repo'
    # https://github.com/whosonfirst/whosonfirst-data/issues/338
    # (the repo name is the basename of the directory containing self.root)
    if props.get('wof:repo', None) == None:
        data_root = self.root
        repo_root = os.path.dirname(data_root)
        props['wof:repo'] = os.path.basename(repo_root)

    # ensure edtf stuff - it might be time for py-whosonfirst-dates/edtf package
    # but not today... (20180503/thisisaaronland)
    for k in ('inception', 'cessation'):
        k = "edtf:%s" % k
        # section 5.2.2 (EDTF) - this appears to have changed to 'XXXX' as of
        # the draft sent to ISO (201602) but we're just going to wait...
        if not props.has_key(k):
            props[k] = u"uuuu"
        # my bad - just adding it here in advance of a proper
        # backfill (20160107/thisisaaronland)
        if props.get(k) == "u":
            props[k] = u"uuuu"

    # now we try to append upper/lower ranges for inception and cessation
    # dates - specifically plain vanilla YMD values that can be indexed by
    # plain old databases (20180503/thisisaaronland)
    # note the use of arrow (.py) since datetime.strptime can't deal with
    # parsing YYYY-MM-DD dates before 1900 because... I mean really, who
    # cares why it's just kind of... bad (20180503/thisisaaronland)
    inception = props.get("edtf:inception", "")
    cessation = props.get("edtf:cessation", "")
    fmt = "YYYY-MM-DD"

    # skip "uuuu" because it resolves to 0001-01-01 9999-12-31 (in edtf.py land)
    if not inception in ("", "uuuu"):
        try:
            e = edtf.parse_edtf(unicode(inception))
            lower = arrow.get(e.lower_strict())
            upper = arrow.get(e.upper_strict())
            props["date:inception_lower"] = lower.format(fmt)
            props["date:inception_upper"] = upper.format(fmt)
        except Exception, e:
            # note: this rebinds `e` from the parsed value to the exception
            logging.warning("Failed to parse inception '%s' because %s" %
                            (inception, e))

    if not cessation in ("", "uuuu", "open"):
        # we'll never get here because of the test above but the point
        # is a) edtf.py freaks out when an edtf string is just "open" (not
        # sure if this is a me-thing or a them-thing and b) edtf.py interprets
        # "open" as "today" which is not what we want to store in the database
        # (20180418/thisisaaronland)
        if cessation == "open" and not inception in ("", "uuuu"):
            cessation = "%s/open" % inception
        try:
            e = edtf.parse_edtf(unicode(cessation))
            lower = arrow.get(e.lower_strict())
            upper = arrow.get(e.upper_strict())
            props["date:cessation_lower"] = lower.format(fmt)
            props["date:cessation_upper"] = upper.format(fmt)
        except Exception, e:
            logging.warning(
                "Failed to parse cessation '%s' because %s" % (cessation, e))
def _as_edtf_object(cls, edtf_format):
    """Parse ``edtf_format`` into an EDTF object.

    Args:
        edtf_format: The value handed to ``parse_edtf``.

    Returns:
        The result of ``parse_edtf(edtf_format)``, or ``None`` when it raises
        ``EDTFParseException``.
    """
    try:
        edtf_object = parse_edtf(edtf_format)
    except EDTFParseException:
        edtf_object = None
    return edtf_object