Пример #1
0
def ext_archive_url(doc_id):

    """
    Try to extract an Internet Archive timestamp from the URL.

    Loads the document, searches its syllabus URL for a
    `web.archive.org/web/<digits>` segment, parses the digits with
    `date_format`, and creates a `Document_Date_Archive_Url` row when the
    parsed date is earlier than the current time.

    Args:
        doc_id (int): The document id.

    Returns:
        The created `Document_Date_Archive_Url` row, or None when the URL
        has no archive timestamp, the timestamp can't be parsed with
        `date_format`, or the parsed date is not in the past.
    """

    doc = Document.get(Document.id == doc_id)

    # Raw string, so the regex escapes aren't treated as (invalid) string
    # escapes by the Python parser.
    match = re.search(
        r'web\.archive\.org/web/(?P<timestamp>\d+)',
        doc.syllabus.url
    )

    if not match:
        return None

    try:
        date = datetime.strptime(match.group('timestamp'), date_format)

    except ValueError:
        # `\d+` accepts a digit run of any length, so the captured value
        # may not fit `date_format`. Treat that as "no usable timestamp"
        # instead of failing the whole extraction.
        return None

    # Only index timestamps that aren't in the future.
    if date < datetime.now():
        return Document_Date_Archive_Url.create(document=doc, date=date)

    return None
Пример #2
0
def test_ignore_regular_url(ext):

    """
    A syllabus URL that doesn't point at the Internet Archive should leave
    the archive-date table empty.
    """

    ext('http://yale.edu/syllabus.html')

    # No row should have been written.
    written = Document_Date_Archive_Url.select().count()
    assert written == 0
Пример #3
0
def test_ignore_future_timestamp(ext):

    """
    An archive URL whose timestamp lies in the future should not be
    indexed.
    """

    # Build a timestamp one year ahead of the current time.
    one_year_ahead = datetime.now() + relativedelta(years=1)
    stamp = one_year_ahead.strftime(date_format)

    # Archive prefix + timestamp, followed by the original syllabus URL.
    archive_prefix = 'https://web.archive.org/web/' + stamp
    original_url = 'http://yale.edu/syllabus.html'

    ext('/'.join([archive_prefix, original_url]))

    # No row should have been written.
    written = Document_Date_Archive_Url.select().count()
    assert written == 0