def _getNumberSelectionSplittedNumber(self, firstPartNumber, lastPartNumber):
    """Find the chunk holding the first part of a number split over two chunks.

    Every selection whose chunk starts with ``lastPartNumber`` is visited in
    ascending ``doc_top`` order. For each of them, the chunks lying in the
    band ``[doc_top - 50, doc_top]`` above it are searched for a chunk that
    matches ``firstPartNumber`` exactly (anchored with ``^`` and ``$``).

    :param firstPartNumber: text of the first part of the number.
    :param lastPartNumber: text of the last part of the number.
    :return: the filtered selection of the first band that contains exactly
        one match for ``firstPartNumber``; ``None`` when no band qualifies.
    """
    # Escaping the first part is not strictly needed, but does no harm;
    # the last part needs it because of the dot.
    firstPattern = '^{}$'.format(helper.escapeForRegex(firstPartNumber))
    lastPattern = '^{}'.format(helper.escapeForRegex(lastPartNumber))

    # All selections whose chunks start with the *last* part of the number.
    lastPartCandidates = self.cutter.filter(auto_regex=lastPattern)

    # Walk the candidates in order of appearance (smallest doc_top first).
    for candidate in sorted(lastPartCandidates, key=lambda sel: sel.doc_top):
        # Chunks that sit slightly (strictly) above the last-part chunk.
        bandAbove = self.cutter.all().filter(
            doc_top__gte=candidate.doc_top - 50,
            doc_bottom__lte=candidate.doc_top,
        )
        firstPartMatches = bandAbove.filter(auto_regex=firstPattern)
        # Exactly one match is, by definition, the number chunk: stop here.
        if len(firstPartMatches) == 1:
            return firstPartMatches

    return None
def _getNumberSelection(self, number):
    """Return the highest selection whose chunk starts with ``number``.

    The final character of ``number`` is dropped before matching (the
    original local was named ``numberWithoutPoint``, so this is presumably a
    trailing dot -- confirm against callers).

    :param number: number text including its last character.
    :return: whatever ``self._getHighestSelection`` yields for the matches.
    """
    trimmed = number[:-1]
    startsWithNumber = '^{}'.format(helper.escapeForRegex(trimmed))
    # All selections that have chunks which start with the number.
    matches = self.cutter.filter(auto_regex=startsWithNumber)
    return self._getHighestSelection(matches)
def _getSubpartSelectionNonStrictBelowNumberSelection(
        self, subpart, numberSelection):
    """Return the highest subpart selection at or below the number chunk.

    Only chunks with ``doc_top >= numberSelection.doc_top - 50`` are
    considered; among them, those matching the subpart pattern are handed to
    ``self._getHighestSelection``.

    :param subpart: subpart text to look for.
    :param numberSelection: selection of the number the subpart belongs to.
    :return: result of ``self._getHighestSelection`` for the matches.
    """
    # A preceding "G" is disallowed: for TOP 18. b) the text "(LFGB)" would
    # otherwise be picked up as well.
    subpartPattern = "[^G]" + helper.escapeForRegex(subpart)

    fromNumberDownwards = self.cutter.all().filter(
        doc_top__gte=numberSelection.doc_top - 50,
    )
    subpartMatches = fromNumberDownwards.filter(regex=subpartPattern)
    return self._getHighestSelection(subpartMatches)
def _getNumberSelection(self, number):
    """Return the highest selection whose chunk starts with ``number``.

    Matches are additionally restricted to ``left <= self.TOPRight`` and
    ``top >= self.page_heading``.

    :param number: number text to match at the start of a chunk.
    :return: result of ``self._getHighestSelection`` for the matches.
    """
    startsWithNumber = '^{}'.format(helper.escapeForRegex(number))
    # All selections that have chunks which start with the number.
    textMatches = self.cutter.filter(auto_regex=startsWithNumber)
    # Nothing can be done if the whole line is one chunk (so right__lte would
    # be a bad criterion), but the chunk should at least start before TOPRight.
    boundedMatches = textMatches.filter(
        left__lte=self.TOPRight,
        top__gte=self.page_heading,
    )
    return self._getHighestSelection(boundedMatches)
def _getNumberSelection(self, number):
    """Return the highest selection whose chunk starts with ``number``.

    Only the text before the first ``"."`` of ``number`` is matched
    (e.g. ``"46."`` becomes ``"46"``), and matches must satisfy
    ``left <= 160``.

    :param number: number text, possibly followed by a dot and more text.
    :return: result of ``self._getHighestSelection`` for the matches.
    """
    # split() rather than slicing, because subparts are passed through here
    # as well.
    beforeDot = number.split(".")[0]
    startsWithNumber = '^{}'.format(helper.escapeForRegex(beforeDot))
    textMatches = self.cutter.filter(auto_regex=startsWithNumber)
    # The left bound keeps out e.g. "980 Sitzung" in the title when looking
    # for TOP 9 (happens in 989 TOP 9).
    leftAlignedMatches = textMatches.filter(left__lte=160)
    return self._getHighestSelection(leftAlignedMatches)
def _getSubpartSelectionNonStrictBelowNumberSelection(
        self, subpart, numberSelection):
    """Return the highest subpart selection at or below the number chunk.

    Candidates are the chunks with ``doc_top >= numberSelection.doc_top - 50``
    that contain ``subpart``, restricted to ``left <= self.TOPRight`` and
    ``top >= self.page_heading``.

    :param subpart: subpart text to look for (escaped before matching).
    :param numberSelection: selection of the number the subpart belongs to.
    :return: result of ``self._getHighestSelection`` for the matches.
    """
    # Escaping turns e.g. "b)" of "46. b)" into "b\)" because of the regex
    # bracket.
    subpartPattern = helper.escapeForRegex(subpart)

    # Chunks below the number chunk plus the number chunk itself: the subpart
    # can live in the same chunk as the number (a) of "1. a)" in NS 970 does,
    # the one of "34. a)" does not).
    fromNumberDownwards = self.cutter.all().filter(
        doc_top__gte=numberSelection.doc_top - 50,
    )
    # Of those, every chunk that contains the given subpart.
    subpartMatches = fromNumberDownwards.filter(auto_regex=subpartPattern)
    # Nothing can be done if the whole line is one chunk (so right__lte would
    # be a bad criterion), but the chunk should at least start before TOPRight.
    boundedMatches = subpartMatches.filter(
        left__lte=self.TOPRight,
        top__gte=self.page_heading,
    )
    # INFO: using the number chunk as an upper bound can break this when the
    # subpart chunk is the number chunk.
    return self._getHighestSelection(boundedMatches)
def _getPrefixStringSelection(self, s):
    """Return the highest selection whose chunk *contains* ``s``.

    :param s: text to search for; it is regex-escaped before matching.
    :return: result of ``self._getHighestSelection`` for the matches.
    """
    # No anchors in the pattern: the text may appear anywhere in the chunk.
    containsPattern = '{}'.format(helper.escapeForRegex(s))
    containingMatches = self.cutter.filter(auto_regex=containsPattern)
    return self._getHighestSelection(containingMatches)