def download(self):
    """Fetch this paper's PDF and record it in the database.

    A new ``PaperPDF`` is attached to the paper, downloaded into
    ``Paper.PAPER_DIR``, then added to the paper's session and committed.

    Raises:
        NoLinkException: The paper has no ``link`` to download from.
        RuntimeError: ``PAPER_DIR`` has not been configured.
    """
    # Preconditions: we need somewhere to fetch from and somewhere to store.
    if not self.link:
        raise NoLinkException("No link attribute on this paper.")
    elif not self.PAPER_DIR:
        raise RuntimeError("Paper.PAPER_DIR is not set.")

    # Attach the PDF record to the paper before fetching the file.
    paper_pdf = PaperPDF(paper_id=self.id)
    self.pdf = paper_pdf
    paper_pdf.download(Paper.PAPER_DIR)

    # Persist the PDF's download location through the paper's own session.
    db = Session.object_session(self)
    db.add(paper_pdf)
    db.commit()
class Paper(Base):
    """ORM model for an exam paper, plus the grammar used to index it.

    Besides the mapped columns, the class body builds a pyparsing grammar
    (``entry``) that recognises question indices and section headers, and
    ``parse_pages`` uses it to turn page text into ``Question`` objects.
    """

    # Directory PDFs are downloaded into; must be set before `download()`.
    PAPER_DIR = None

    # ORM
    __tablename__ = "paper"
    id = Column(Integer, primary_key=True)
    module_id = Column(Integer, ForeignKey("module.id"))
    name = Column(String)
    period = Column(String)
    sitting = Column(Integer)
    year_start = Column(Integer)
    year_stop = Column(Integer)
    pdf = relationship("PaperPDF", backref="paper", uselist=False)
    link = Column(String)
    questions = relationship("Question", backref="paper")
    raw_contents = Column(Text)
    indexed = Column(Boolean)
    indexed_at = Column(DateTime)
    parseable = Column(Boolean)

    # Order by paper period
    order_by_period = sqlalchemy.sql.expression.case(
        ((period == "Winter", 1),
         (period == "Summer", 2),
         (period == "Autumn", 3),
         (period == "Spring", 4))
    )

    def __init__(self, module, name, period, sitting, year_start, year_stop, link):
        """Create a paper and store the given attributes on it.

        Args:
            module: The module this paper belongs to.
            name: The paper's name.
            period: The exam period (the values used by `order_by_period`
                are "Winter", "Summer", "Autumn" and "Spring").
            sitting: The sitting number.
            year_start: First year of the academic year.
            year_stop: Second year of the academic year.
            link: Link to the paper's PDF; may be falsy when unavailable.
        """
        self.module = module
        self.name = name
        self.period = period
        self.sitting = sitting
        self.year_start = year_start
        self.year_stop = year_stop
        self.link = link

    def index(self):
        """Download (if needed) and parse the paper's questions.

        The paper is marked as indexed up front. The PDF contents are read
        page by page, stored in `raw_contents`, and every page after the
        first is handed to `Paper.parse_pages`. The indexing state is saved
        whether or not parsing succeeds; `parseable` is only set on success.

        The document is assumed to follow the Paper Specification in
        notebooks/Exam Paper Feature Exraction.ipynb.

        Raises:
            UnparseableException: The PDF contents could not be read, or
                `parse_pages` found no questions.
            Exception: Anything raised by `download()` or `parse_pages` is
                re-raised after the indexing state has been saved.
        """
        logging.info("Indexing paper %r" % self)
        self.indexed = True
        self.indexed_at = datetime.now()

        try:
            if not self.pdf:
                # Looks like we have no PDF associated with this paper.
                # Download it.
                self.download()

            try:
                # Get the contents of the PDF.
                # NOTE: `unicode` is the Python 2 builtin.
                pages = [unicode(page, errors='replace') for page in self.pdf.get_contents()]
            except Exception:
                # Convert any PDF reading error to Unparseable. `Exception`
                # (not a bare except) lets KeyboardInterrupt/SystemExit
                # propagate instead of being misreported as a parse failure.
                raise UnparseableException()

            # Save the raw content
            self.raw_contents = ''.join(pages)

            # Parse the pages contents
            # TODO: Parse first page.
            # TODO: Discuss: should we insert questions into the database
            # without content (i.e. just keep the indices)?
            self.questions = Paper.parse_pages(pages[1:])
        except:
            # Save the indexing information (indexed / indexed_at) even
            # though indexing failed.
            self.save()

            # Re-raise the exception being handled. A bare `raise` keeps the
            # original type, value and traceback and is valid syntax on both
            # Python 2 and Python 3.
            raise

        # If it got this far, it means no UnparseableException has been
        # raised and we've parsed the paper!
        self.parseable = True
        self.save()

    def save(self):
        """Add self to the session it belongs to and commit."""
        session = Session.object_session(self)
        session.add(self)
        session.commit()

    def download(self):
        """Download the PDF and save it to the database.

        Raises:
            NoLinkException: The paper has no `link`.
            RuntimeError: `PAPER_DIR` is not set.
        """
        if not self.link:
            raise NoLinkException("No link attribute on this paper.")

        if not self.PAPER_DIR:
            raise RuntimeError("Paper.PAPER_DIR is not set.")

        self.pdf = PaperPDF(paper_id=self.id)

        # Download it
        self.pdf.download(Paper.PAPER_DIR)

        # Save its download location to the database
        session = Session.object_session(self)
        session.add(self.pdf)
        session.commit()

    def __repr__(self):
        """Return a debug representation including module code and status."""
        return "<Paper(id={id}, {module}, {year_start}/{year_stop}, {period}, {sitting}, link={link}{indexed})>".format(
            id=self.id,
            module=self.module.code,
            year_start=self.year_start,
            year_stop=self.year_stop,
            sitting=self.sitting,
            period=self.period,
            link=(self.link is not None),
            indexed=(", indexed" if self.is_indexed() else ""))

    def get_question(self, *path):
        """Look up a question by its index path.

        Args:
            *path: The path to the question as integers.

        Example:
            To get to question 1, (a), ii. you need to convert the indexes
            to their integer form i.e. 1 = 1, (a) = 1, ii = 2 then pass
            these ints as arguments.

                paper.get_question(1, 1, 2)

        Returns:
            The first question in `self.questions` whose `path` compares
            equal to the tuple of arguments, or None when nothing matches.
        """
        for question in self.questions:
            if question.path == path:
                return question

    def get_root_questions(self):
        """Return the list of questions whose path has exactly one element."""
        # A list comprehension returns a list on both Python 2 and 3
        # (`filter` is lazy on Python 3).
        return [q for q in self.questions if len(q.path) == 1]

    def is_indexed(self):
        """Return whether a paper is indexed or not."""
        return self.indexed

    def to_dict(self):
        """Return a plain-dict summary of the paper."""
        return {
            'id': self.id,
            'years': [self.year_start, self.year_stop],
            'name': self.name,
            'sitting': self.sitting,
            'period': self.period
        }

    def get_link(self, module, format=None):
        """Build the paper's URL path.

        Args:
            module: Object whose `code` is used in the path.
            format: Optional extension appended as ".<format>".

        Returns:
            str: "/paper/<code>/<year_start>/<period lowercased>[.<format>]".
        """
        return "/paper/{}/{}/{}{}".format(
            module.code,
            self.year_start,
            self.period.lower(),
            "." + format if format else "")

    def get_status(self):
        """Return "unavailable", "unindexed", "unparseable" or "available"."""
        if not self.link:
            return "unavailable"
        elif not self.indexed:
            return "unindexed"
        elif self.indexed and not self.parseable:
            return "unparseable"
        elif self.indexed and self.parseable:
            return "available"

    @property
    def short_period(self):
        """The first three characters of the period."""
        return self.period[:3]

    @property
    def year(self):
        """Alias for `year_start`."""
        return self.year_start

    ######################################
    # Paper parser.
    ######################################

    # Let's define our parser.
    # `_` is exactly one whitespace char or a line boundary; `__` is any run
    # of whitespace; `___` and `____` are the optional forms of `__` and `_`.
    _ = (pp.White(exact=1) | pp.LineEnd() | pp.LineStart()).suppress()
    __ = pp.White().suppress()
    ___ = pp.Optional(__)
    ____ = pp.Optional(_)

    # Define tokens for numerical, alpha and roman indices
    # Max two digits for numerical indices because lecturers aren't psychopaths
    index_digit = pp.Word(pp.nums, max=2).setParseAction(lambda s, l, t: [Index("decimal", int(t[0]))])("[0-9]")
    index_alpha = pp.Word(pp.alphas, exact=1).setParseAction(lambda s, l, t: [Index("alpha", t[0])])("[a-z]")
    # Roman numerals are limited to the characters i, v and x.
    index_roman = pp.Word("ivx").setParseAction(lambda s, l, t: [Index("roman", t[0])])("[ivx]")
    index_type = (index_digit | index_roman | index_alpha)("index")

    # Define token for "Question" followed by an optional "."
    question = (pp.CaselessLiteral("Question") + pp.Optional("."))("question")

    # Define tokens for formatted indices e.g [a], (1), ii. etc.
    index_dotted = (index_type + pp.Literal(".").suppress()).setParseAction(lambda s, l, t: t[0].setNotation("dotted"))
    index_round_brackets = (pp.Optional(pp.Literal("(")).suppress() + index_type + pp.Literal(")").suppress()).setParseAction(lambda s, l, t: t[0].setNotation("round"))
    index_square_brackets = (pp.Literal("[").suppress() + index_type + pp.Literal("]").suppress()).setParseAction(lambda s, l, t: t[0].setNotation("square"))
    index_question = (pp.Word("qQ", exact=1).suppress() + pp.Optional(".").suppress() + index_type + pp.Optional(".").suppress()).setParseAction(lambda s, l, t: t[0].setNotation("question"))

    # Define final index token with optional Question token before formatted index
    qindex = (
        # Whitespace is required before each index (otherwise the "d." in
        # "hello world." would be taken for an index)
        _ +
        # Optional "Question." before
        pp.Optional(question + ___).suppress() +
        # The index
        (index_question | index_dotted | index_round_brackets | index_square_brackets) +
        # Required whitespace *after* index
        _
    )

    # Define a section header
    section = (pp.CaselessKeyword("Section").suppress() + __ + index_type + _).setParseAction(
        lambda s, l, t: [t[0].section()]
    )("section")

    # Entry point for the parser
    entry = section ^ qindex

    @staticmethod
    def parse_pages(pages):
        """Parse the question structure out of the pages of a paper.

        The text is scanned with the `Paper.entry` grammar. Every index found
        is compared against a stack holding the current index path and is
        either pushed as a new question or ignored. The text between accepted
        indices is attached to the preceding question via `set_content`.

        Args:
            pages (str or List[str]): The page text. A list of pages is
                joined with single spaces before parsing.

        Returns:
            List[Question]: Every question created, in order of appearance.

        Raises:
            UnparseableException: No questions were found.
        """
        # If we're passed in a list of pages, join them together
        if isinstance(pages, list):
            pages = ' '.join(pages)

        logging.info("Parsing exam paper question pages.")
        logging.info(pages)

        index_stack = []     # The stack that holds the current index path
        question_stack = []  # The questions matching each index_stack entry
        questions = []       # Every question created, in document order
        question, last_question, marker = None, None, 0

        # Both helpers read `index_stack` / `question_stack` through the
        # closure, so they see the rebinding done in branch 1.2.1.1 below.

        # Push a new question for `index` onto the stacks.
        def push(index):
            index_stack.append(index)
            question = Question(index_stack)

            if question_stack:
                question_stack[-1].children.append(question)

            question_stack.append(question)
            questions.append(question)

            return question

        # Drop the innermost index/question from the stacks.
        def pop():
            index_stack.pop()
            question_stack.pop()

        # Loop over every token we've parsed from the pages
        for token, start, end in Paper.entry.leaveWhitespace().scanString(pages, overlap=True):
            index = token[0]  # The incoming index

            logging.info("0. Handling index %r" % index)

            # If the container is the paper, just push the question.
            # NOTE(review): because of this `continue`, the first top-level
            # question skips the content-saving step at the bottom of the
            # loop - confirm this is intended.
            if not index_stack:
                logging.info("1. Pushing top level index %r." % index)
                question = push(index)
                continue

            last_index = index_stack[-1]  # The last index is the last item in the stack

            if index.isSimilar(last_index):
                logging.info("1.1 Similiar indexes %s and previous %s." % (index.index_type, last_index.index_type))

                if last_index.isNext(index):
                    logging.info("1.1.1 Pushing index with same type as last index and in sequence.")
                    pop()
                    question = push(index)
                else:
                    logging.info("1.1.2 Question with similar indexes but not in sequence, ignoring.")
                    continue
            else:
                logging.info("1.2 Dissimilar indexes %s and previous %s."
                             % (index.index_type, last_index.index_type))
                parent_index, n = None, 0

                # Go through the stack (innermost first) and find the similar index
                for i, idx in reversed(list(enumerate(index_stack))):
                    if idx.isSimilar(index):
                        parent_index, n = idx, i
                        break

                # We need to traverse the stack and see if we can find a similar index
                if parent_index:
                    logging.info("1.2.1 Index similar to parent index %d up the stack [%r]" % (n, parent_index))

                    # If we have found a similar index and they're in sequence,
                    # add the question after the found container.
                    if parent_index.isNext(index):
                        logging.info("1.2.1.1 Index in sequence, pushing into stack.")
                        # Cut both stacks back to just below the sibling found.
                        index_stack = index_stack[:n]
                        question_stack = question_stack[:n]
                        question = push(index)
                    else:
                        logging.info("1.2.1.2 Index not in sequence, ignoring")
                        continue
                # If we encounter a new type of index and it's not the start of
                # a new list, we can just discard it (it's probably marks).
                # However if the previous index is a section, we can just continue.
                elif index.i == 1 or last_index.is_section:
                    logging.info("1.2.2 Pushing new question %r." % index)
                    question = push(index)
                else:
                    logging.info("1.2.3 New index value not first in sequence, ignoring.")
                    continue

            # Save the text between the previous accepted index and this one
            if last_question is not None:
                last_question.set_content(None, pages[marker:start])
                last_question = None
                marker = end
            elif marker == 0:
                marker = end

            last_question = question

        # Squeeze out that last part
        if last_question:
            last_question.set_content(None, pages[marker:])

        # Test to see if we have any data returned. For now, we'll assume
        # the paper is unparseable if no questions were found.
        if not questions:
            raise UnparseableException()

        return questions

    #####################################
    # SQL methods
    #####################################

    @staticmethod
    def getById(session, id):
        """Return the paper with the given id.

        Uses `Query.one()`, so the lookup raises when there is no match.
        """
        return session.query(Paper).filter(Paper.id == id).one()

    @staticmethod
    def find(session, module, year, period):
        """Find the paper of a module for a given start year and period.

        Args:
            session: The database session to query with.
            module: The module; its `id` is matched against `module_id`.
            year: The paper's start year (converted with `int`).
            period: "summer", "autumn", "winter" or "spring", in any case.

        Returns:
            Paper: The matching paper.

        Raises:
            InvalidInput: `period` is not one of the four known periods.
            NotFound: No paper matches.
        """
        normalized = period.lower()

        # "spring" was previously misspelt "sprint" here, which rejected
        # every Spring paper lookup.
        if normalized not in ("summer", "autumn", "winter", "spring"):
            raise InvalidInput("paper", "Invalid period '%s'." % period)

        try:
            return session.query(Paper).filter(
                (Paper.module_id == module.id) &
                (Paper.year_start == int(year)) &
                # Periods are stored capitalised, e.g. "Summer".
                (Paper.period == normalized.capitalize())
            ).one()
        except NoResultFound:
            raise NotFound("paper", "Paper %s %s not found." % (period, year))