def ext_semester(doc_id):
    """
    Try to find a "Spring/Fall YY/YYYY" pattern in a document's text.

    Only the first match in the text is considered. A row is saved only
    when its date falls after 1980 and before the current time.

    Args:
        doc_id (int): The document id.

    Returns:
        Document_Date_Semester: The saved row, or None when no semester
        pattern is found or the matched date is out of range.
    """
    # NOTE(review): presumably raises if the document has no text row yet;
    # confirm that callers only run this after text extraction.
    doc_text = Document_Text.get(Document_Text.document == doc_id)

    # The leading \b keeps words that merely end in a season name
    # ("waterfall 1999", "rainfall 2010") from matching, and the trailing
    # (?!\d) stops a longer number from being truncated into a year
    # ("fall 20145" -> "2014", "fall 123" -> "12").
    pattern = re.compile(r'''
        \b(?P<semester>fall|autumn|winter|spring|summer)
        [\s\']+
        (?P<year>\d{4}|\d{2})
        (?!\d)
    ''', re.I | re.X)

    match = pattern.search(doc_text.text)
    if not match:
        return None

    row = Document_Date_Semester(
        document=doc_id,
        offset=match.start(),
        semester=match.group('semester'),
        year=match.group('year'),
    )

    # `row.date` is computed by the model (presumably from semester and
    # year); dates outside the accepted window are never saved.
    if row.date.year > 1980 and row.date < datetime.now():
        row.save()
        return row

    return None
def test_text_extraction_succeeds(models, mock_osp):
    """
    ext_text() should extract the text of a document's file and store it
    in a `Document_Text` row linked to that document.
    """
    # Register a mock file and a document row that points at it.
    doc = Document.create(path=mock_osp.add_file(content="text"))

    ext_text(doc.id)

    # Look up the text row written for this document.
    stored = Document_Text.get(Document_Text.document == doc)
    assert stored.text == "text"