def ext_archive_url(doc_id):

    """
    Try to extract an Internet Archive timestamp from the URL.

    The document's syllabus URL is searched for a
    ``web.archive.org/web/<digits>`` segment. The digits are parsed with the
    module-level ``date_format``, and a row is written only when the parsed
    date is earlier than the current local time.

    Args:
        doc_id (int): The document id.

    Returns:
        The result of ``Document_Date_Archive_Url.create(...)`` when a past
        timestamp was found; otherwise None (no archive segment in the URL,
        digits that don't fit ``date_format``, or a timestamp that is not
        in the past).
    """

    doc = Document.get(Document.id == doc_id)

    # Raw string: `\.` and `\d` are regex escapes, not string escapes.
    match = re.search(
        r'web\.archive\.org/web/(?P<timestamp>\d+)',
        doc.syllabus.url,
    )

    if not match:
        return None

    try:
        date = datetime.strptime(match.group('timestamp'), date_format)
    except ValueError:
        # The digit run doesn't fit `date_format` (e.g. a truncated or
        # malformed timestamp) - treat it as "no usable date" rather than
        # failing the whole extraction.
        return None

    # Timestamps that aren't in the past can't be real capture dates.
    if date < datetime.now():
        return Document_Date_Archive_Url.create(
            document=doc,
            date=date,
        )

    return None
def test_ignore_regular_url(ext):

    """
    When the syllabus was scraped from a regular URL (one without a
    web.archive.org segment), don't write a row.
    """

    # Run the extractor via the fixture; only its side effect on the
    # table is checked below.
    ext('http://yale.edu/syllabus.html')

    # Shouldn't write a row.
    assert Document_Date_Archive_Url.select().count() == 0
def test_ignore_future_timestamp(ext):

    """
    Don't index timestamps from the future.
    """

    # Get now + 1 year, formatted the same way archive timestamps are parsed.
    future = datetime.now() + relativedelta(years=1)
    timestamp = future.strftime(date_format)

    # Archive-style URL: <archive prefix + timestamp>/<original URL>.
    archive_prefix = 'https://web.archive.org/web/' + timestamp
    original_url = 'http://yale.edu/syllabus.html'

    # Run the extractor via the fixture; only its side effect on the
    # table is checked below.
    ext(archive_prefix + '/' + original_url)

    # Shouldn't write a row.
    assert Document_Date_Archive_Url.select().count() == 0