Пример #1
0
 def _single_length_title(self, table, row_index, current_col):
     '''
     Returns true if the row is a single length title element with no other row titles. Useful
     for tracking pre-data titles that belong in their own block.

     Args:
         table: Cells indexed as table[row][column].
         row_index: Index of the row to inspect.
         current_col: Column of the candidate title cell.

     Returns:
         False when current_col is at or past the end of the row, or when the cell at
         current_col is not a text cell. Otherwise True only if no cell after current_col
         in the same row is a text cell.
     '''
     row = table[row_index]
     # Nothing sits at current_col, so there is no title to report.
     if current_col >= len(row):
         return False
     if not is_text_cell(row[current_col]):
         return False
     # range instead of the Python 2 only xrange, so this also runs on Python 3.
     return not any(is_text_cell(row[next_column])
                    for next_column in range(current_col + 1, len(row)))
Пример #2
0
 def _single_length_title(self, table, row_index, current_col):
     '''
     Returns true if the row is a single length title element with no other row titles. Useful
     for tracking pre-data titles that belong in their own block.

     table is indexed as table[row][column]. The check looks at the cell at current_col and at
     every cell to its right in row row_index; cells to the left are ignored. A current_col at
     or beyond the end of the row yields False.
     '''
     row_length = len(table[row_index])
     if row_length - current_col <= 0:
         return False
     # Built with range rather than xrange: xrange does not exist on Python 3.
     later_columns = range(current_col + 1, row_length)
     return (is_text_cell(table[row_index][current_col]) and
             all(not is_text_cell(table[row_index][next_column])
                 for next_column in later_columns))
Пример #3
0
 def _find_block_start(self, table, used_cells, possible_block_start,
                       start_pos, end_pos):
     '''
     Finds the start of a block from a suggested start location.
     This location can be at a lower column but not a lower row.

     Note this also finds the lowest row of block_end.

     Args:
         table: Cells indexed as table[row][column].
         used_cells: Indexed like table; a cell that tests true ends the
             downward scan (presumably it is already claimed by another block).
         possible_block_start: (row, column) pair suggesting where the block starts.
         start_pos: (row, column) pair; start_pos[1] is the leftmost column
             the block start may be moved to.
         end_pos: (row, column) pair; end_pos[0] is the last row scanned.

     Returns:
         A tuple (block_start, block_end), each a list built from
         possible_block_start and then adjusted as [row, column].
     '''
     # Number of rows from the suggested start within which we still guess
     # the column holds a repeating title. Constant, so set once up front.
     blank_repeat_threshold = 3
     current_col = possible_block_start[1]
     block_start = list(possible_block_start)
     block_end = list(possible_block_start)
     repeat = True
     checked_all = False
     # Repeat until we've met satisfactory conditions for
     # catching all edge cases or we've checked all valid
     # block locations
     while not checked_all and repeat:
         block_end[0] = max(block_end[0], possible_block_start[0])
         block_end[1] = max(block_end[1], current_col)
         table_column = TableTranspose(table)[current_col]
         used_column = TableTranspose(used_cells)[current_col]
         # We need to find a non empty cell before we can stop
         blank_start = is_empty_cell(table_column[possible_block_start[0]])
         # Unless we have assume_complete_blocks set to True
         if blank_start and self.assume_complete_blocks:
             # Found a blank? We're done
             repeat = False
             break
         blank_exited = not blank_start
         #TODO refactor code below into new function for easier reading
         # Analyze the beginning columns
         for row_index in range(possible_block_start[0], end_pos[0]+1):
             # Ensure we catch the edge case of the data reaching the edge of
             # the table -- block_end should then equal end_pos
             if blank_exited:
                 block_end[0] = max(block_end[0], row_index)
             if row_index == end_pos[0] or used_column[row_index]:
                 # We've gone through the whole range
                 checked_all = True
                 break
             elif not blank_exited:
                 # Still inside the leading blanks; see if this row ends them
                 blank_exited = not is_empty_cell(table_column[row_index])
             elif is_empty_cell(table_column[row_index]):
                 # Blank after the data began: retry from the next column over
                 current_col += 1
                 break
             else:
                 # Go find the left most column that's still valid
                 table_row = table[row_index]
                 used_row = used_cells[row_index]
                 for column_index in range(current_col, start_pos[1]-1, -1):
                     if is_empty_cell(table_row[column_index]) or used_row[column_index]:
                         break
                     else:
                         block_start[1] = min(block_start[1], column_index)
             # Check if we've seen few enough cells to guess that we have a repeating title
             repeat = blank_start or 1+row_index-possible_block_start[0] <= blank_repeat_threshold

     return block_start, block_end
Пример #4
0
 def _fill_column_holes(self):
     '''
     Column counterpart of _fill_row_holes.

     Walks the columns from self.start[1] up to (not including) self.end[1] and passes a
     column to _check_fill_title_column when its cell on row self.start[0] is a text cell.
     '''
     for col in range(self.start[1], self.end[1]):
         top_cell = TableTranspose(self.table)[col][self.start[0]]
         if not is_text_cell(top_cell):
             continue
         self._check_fill_title_column(col)
Пример #5
0
 def _fill_column_holes(self):
     '''
     Same job as _fill_row_holes, applied to columns: every column in
     [self.start[1], self.end[1]) whose first cell (the one on row self.start[0]) is text
     gets handed to _check_fill_title_column.
     '''
     column_indices = range(self.start[1], self.end[1])
     for column_index in column_indices:
         # The transposed view is rebuilt on each pass, as the fill call may touch the table.
         transposed = TableTranspose(self.table)
         head = transposed[column_index][self.start[0]]
         if is_text_cell(head):
             self._check_fill_title_column(column_index)
Пример #6
0
 def _fill_row_holes(self):
     '''
     Fills whatever row title cells are still empty. Run this only after the other passes,
     otherwise empty cells reserved for other operations would be filled too early.

     Covers rows self.start[0] up to (not including) self.max_title_row; a row is passed to
     _check_fill_title_row when its cell in column self.start[1] is a text cell.
     '''
     for title_row in range(self.start[0], self.max_title_row):
         leading_cell = self.table[title_row][self.start[1]]
         if not is_text_cell(leading_cell):
             continue
         self._check_fill_title_row(title_row)
Пример #7
0
 def _fill_row_holes(self):
     '''
     Final pass over the title rows (self.start[0] up to, but excluding,
     self.max_title_row) that fills row title cells still left empty. It has to come after
     the other passes so that empty cells reserved for other operations are not filled
     prematurely.
     '''
     for index in range(self.start[0], self.max_title_row):
         row = self.table[index]
         # Only rows that open with a text cell are treated as title rows.
         if is_text_cell(row[self.start[1]]):
             self._check_fill_title_row(index)
Пример #8
0
 def _fill_row_holes(self):
     '''
     Fills the row title cells that are still empty.
     Has to run after the other passes, so that empty
     cells reserved for other operations are not filled
     ahead of time.

     Rows self.start[0] up to (not including) self.end[0]
     are examined; a row goes to _check_fill_title_row when
     its cell in column self.start[1] is a text cell.

     TODO potential optimization, mark rows/columns that
     were already processed and skip them here.
     '''
     for current_row in range(self.start[0], self.end[0]):
         first_cell = self.table[current_row][self.start[1]]
         if not is_text_cell(first_cell):
             continue
         self._check_fill_title_row(current_row)
Пример #9
0
 def _fill_row_holes(self):
     '''
     Last pass over the block's rows (self.start[0] up to,
     but excluding, self.end[0]): any row whose cell in
     column self.start[1] is text is given to
     _check_fill_title_row so its empty title cells get filled.

     This must come after the other passes, which may still
     need those empty cells for other operations.

     TODO potential optimization, mark rows/columns that
     were already processed and skip them here.
     '''
     row_span = range(self.start[0], self.end[0])
     for index in row_span:
         cells = self.table[index]
         if is_text_cell(cells[self.start[1]]):
             self._check_fill_title_row(index)
Пример #10
0
    def _check_interpret_cell(self, cell, prior_cell, row_index, column_index):
        '''
        Helper which inspects a title cell and rewrites it where needed: an empty cell takes
        the value of prior_cell, and a non-empty cell that is not text is converted with str().
        Each rewrite is recorded through self.flag_change at (row_index, column_index).

        Returns:
            A tuple of the form '(cell, changed)' where 'changed' indicates if 'cell' differs from
            input.
        '''
        position = (row_index, column_index)
        if is_empty_cell(cell):
            # If we find a blank cell, propagate the prior title
            self.flag_change(self.flags, 'interpreted', position,
                             self.worksheet, self.FLAGS['copied-title'])
            return prior_cell, True
        if not is_text_cell(cell):
            self.flag_change(self.flags, 'interpreted', position,
                             self.worksheet, self.FLAGS['converted-to-string'])
            return str(cell), True
        # Non-empty text needs no translation.
        return cell, False
Пример #11
0
    def _check_interpret_cell(self, cell, prior_cell, row_index, column_index):
        '''
        Checks the type of a cell and translates it to a usable title where necessary.

        A blank cell is replaced by prior_cell (flagged 'copied-title'); a non-blank cell that
        is not text is replaced by str(cell) (flagged 'converted-to-string'). Text cells pass
        through untouched and raise no flag.

        Returns:
            A tuple of the form '(cell, changed)' where 'changed' indicates if 'cell' differs from
            input.
        '''
        blank = is_empty_cell(cell)
        if not blank and is_text_cell(cell):
            # Already a usable title; leave it alone.
            return cell, False
        flag_name = 'copied-title' if blank else 'converted-to-string'
        self.flag_change(self.flags, 'interpreted',
                         (row_index, column_index), self.worksheet,
                         self.FLAGS[flag_name])
        # A blank cell inherits the prior title; anything else is stringified.
        new_cell = prior_cell if blank else str(cell)
        return new_cell, True
Пример #12
0
    def _find_block_start(self, table, used_cells, possible_block_start,
                          start_pos, end_pos):
        '''
        Finds the start of a block from a suggested start location.
        This location can be at a lower column but not a lower row.

        Note this also finds the lowest row of block_end.

        Args:
            table: Cells indexed as table[row][column].
            used_cells: Indexed like table; a cell that tests true stops the
                scan (presumably because another block already owns it).
            possible_block_start: (row, column) pair to begin searching from.
            start_pos: (row, column) pair; the leftward search never goes
                below column start_pos[1].
            end_pos: (row, column) pair; the downward scan ends at row
                end_pos[0].

        Returns:
            A tuple (block_start, block_end); both are lists copied from
            possible_block_start and then adjusted as [row, column].
        '''
        # While no more than this many rows have been scanned from the
        # suggested start, the column is still guessed to be a repeating
        # title. The value never changes, so it lives outside the loop.
        blank_repeat_threshold = 3
        current_col = possible_block_start[1]
        block_start = list(possible_block_start)
        block_end = list(possible_block_start)
        repeat = True
        checked_all = False
        # Repeat until we've met satisfactory conditions for
        # catching all edge cases or we've checked all valid
        # block locations
        while not checked_all and repeat:
            block_end[0] = max(block_end[0], possible_block_start[0])
            block_end[1] = max(block_end[1], current_col)
            table_column = TableTranspose(table)[current_col]
            used_column = TableTranspose(used_cells)[current_col]
            # We need to find a non empty cell before we can stop
            blank_start = is_empty_cell(table_column[possible_block_start[0]])
            # Unless we have assume_complete_blocks set to True
            if blank_start and self.assume_complete_blocks:
                # Found a blank? We're done
                repeat = False
                break
            blank_exited = not blank_start
            #TODO refactor code below into new function for easier reading
            # Analyze the beginning columns
            for row_index in range(possible_block_start[0], end_pos[0] + 1):
                # Ensure we catch the edge case of the data reaching the edge of
                # the table -- block_end should then equal end_pos
                if blank_exited:
                    block_end[0] = max(block_end[0], row_index)
                if row_index == end_pos[0] or used_column[row_index]:
                    # We've gone through the whole range
                    checked_all = True
                    break
                elif not blank_exited:
                    # Leading blanks so far; note whether this row ends them
                    blank_exited = not is_empty_cell(table_column[row_index])
                elif is_empty_cell(table_column[row_index]):
                    # A blank below real data: move one column right and rescan
                    current_col += 1
                    break
                else:
                    # Go find the left most column that's still valid
                    table_row = table[row_index]
                    used_row = used_cells[row_index]
                    for column_index in range(current_col, start_pos[1] - 1,
                                              -1):
                        if (is_empty_cell(table_row[column_index])
                                or used_row[column_index]):
                            break
                        else:
                            block_start[1] = min(block_start[1], column_index)
                # Check if we've seen few enough cells to guess that we have a repeating title
                rows_seen = 1 + row_index - possible_block_start[0]
                repeat = blank_start or rows_seen <= blank_repeat_threshold

        return block_start, block_end