def _single_length_title(self, table, row_index, current_col):
    '''
    Returns true if the row is a single length title element with no other
    row titles. Useful for tracking pre-data titles that belong in their own
    block.

    Args:
        table: Sequence of rows, each row an indexable sequence of cells.
        row_index: Index of the row to inspect.
        current_col: Column at which the candidate title sits.

    Returns:
        False when current_col is at or past the end of the row. Otherwise
        a truthy value only if the cell at current_col is a text cell and
        no cell to its right in the same row is a text cell.
    '''
    row = table[row_index]
    # Nothing to inspect when the column lies at or beyond the row's end.
    if len(row) - current_col <= 0:
        return False
    # range (rather than the Python-2-only xrange) keeps this method usable
    # on Python 3 and matches the other loops in this file.
    return (is_text_cell(row[current_col]) and
            all(not is_text_cell(row[next_column])
                for next_column in range(current_col + 1, len(row))))
def _single_length_title(self, table, row_index, current_col):
    '''
    Returns true if the row is a single length title element with no other
    row titles. Useful for tracking pre-data titles that belong in their own
    block.

    Args:
        table: Sequence of rows, each row an indexable sequence of cells.
        row_index: Index of the row to inspect.
        current_col: Column of the cell being tested as a lone title.

    Returns:
        False if current_col does not fall inside the row. Otherwise a
        truthy value only when the cell at current_col is a text cell and
        every later cell in that row is not a text cell.
    '''
    row = table[row_index]
    row_length = len(row)
    # A column at or past the end of the row cannot hold a title.
    if row_length - current_col <= 0:
        return False
    # Use range instead of xrange: xrange does not exist on Python 3, and
    # range behaves the same for this small scan on Python 2.
    return (is_text_cell(row[current_col]) and
            all(not is_text_cell(row[next_column])
                for next_column in range(current_col + 1, row_length)))
def _find_block_start(self, table, used_cells, possible_block_start, start_pos, end_pos):
    '''
    Finds the start of a block from a suggested start location. This location can be at a lower
    column but not a lower row. Note this also finds the lowest row of block_end.

    Args:
        table: Row-indexable table of cells (read as table[row][column]).
        used_cells: Indexed like table; a truthy entry stops the scan at
            that cell.
        possible_block_start: Suggested (row, column) start; indices 0 and
            1 are read.
        start_pos: Only start_pos[1] is read, as the left-most column the
            leftward search may reach.
        end_pos: Only end_pos[0] is read, as the last row scanned.

    Returns:
        A tuple (block_start, block_end). Both are lists copied from
        possible_block_start; block_start may get a smaller column, and
        block_end may get a larger row and column.

    NOTE(review): this method was re-indented from a copy that had lost its
    indentation; the nesting below is the most plausible reading and should
    be confirmed against the canonical source.
    '''
    current_col = possible_block_start[1]
    block_start = list(possible_block_start)
    block_end = list(possible_block_start)
    repeat = True
    checked_all = False
    # Repeat until we've met satisfactory conditions for
    # catching all edge cases or we've checked all valid
    # block locations
    while(not checked_all and repeat):
        block_end[0] = max(block_end[0], possible_block_start[0])
        block_end[1] = max(block_end[1], current_col)

        # The transposed objects are indexed by column here and by row below
        table_column = TableTranspose(table)[current_col]
        used_column = TableTranspose(used_cells)[current_col]
        # We need to find a non empty cell before we can stop
        blank_start = is_empty_cell(table_column[possible_block_start[0]])
        # Unless we have assume_complete_blocks set to True
        if blank_start and self.assume_complete_blocks:
            # Found a blank? We're done
            repeat = False
            break
        # Stays False until the first non-empty cell of this column is seen
        blank_exited = not blank_start
        blank_repeat_threshold = 3
        # NOTE(review): parent_title is assigned but never read in this method
        parent_title = blank_start or is_text_cell(table_column[possible_block_start[0]])
        #TODO refactor code below into new function for easier reading
        # Analyze the beginning columns
        for row_index in range(possible_block_start[0], end_pos[0]+1):
            # Ensure we catch the edge case of the data reaching the edge of
            # the table -- block_end should then equal end_pos
            if blank_exited:
                block_end[0] = max(block_end[0], row_index)
            if row_index == end_pos[0] or used_column[row_index]:
                # We've gone through the whole range
                checked_all = True
                break
            elif not blank_exited:
                blank_exited = not is_empty_cell(table_column[row_index])
            elif is_empty_cell(table_column[row_index]):
                # Data in this column ended; continue with the next column
                current_col += 1
                break
            else:
                # Go find the left most column that's still valid
                table_row = table[row_index]
                used_row = used_cells[row_index]
                for column_index in range(current_col, start_pos[1]-1, -1):
                    if is_empty_cell(table_row[column_index]) or used_row[column_index]:
                        break
                    else:
                        block_start[1] = min(block_start[1], column_index)
        # Check if we've seen few enough cells to guess that we have a repeating title
        # NOTE(review): placing this after the row loop (not inside it) is
        # inferred from the comment's intent -- confirm.
        repeat = blank_start or 1+row_index-possible_block_start[0] <= blank_repeat_threshold

    return block_start, block_end
def _fill_column_holes(self):
    '''
    Column-wise counterpart of _fill_row_holes.

    Walks the columns from self.start[1] up to, but not including,
    self.end[1]. Whenever the cell of a column in row self.start[0] is a
    text cell, that column is handed to _check_fill_title_column.
    '''
    for col in range(self.start[1], self.end[1]):
        # Only columns whose first block cell is text are title columns
        top_cell = TableTranspose(self.table)[col][self.start[0]]
        if not is_text_cell(top_cell):
            continue
        self._check_fill_title_column(col)
def _fill_row_holes(self):
    '''
    Fills whatever row title cells are still empty.

    Runs after the other passes so that empty cells reserved for other
    operations are not filled in too early.

    Walks the rows from self.start[0] up to, but not including,
    self.max_title_row. Whenever the cell of a row in column self.start[1]
    is a text cell, that row is handed to _check_fill_title_row.
    '''
    for row in range(self.start[0], self.max_title_row):
        # Only rows whose first block cell is text are title rows
        leading_cell = self.table[row][self.start[1]]
        if not is_text_cell(leading_cell):
            continue
        self._check_fill_title_row(row)
def _fill_row_holes(self):
    '''
    Fills whatever row title cells are still empty.

    Runs after the other passes so that empty cells reserved for other
    operations are not filled in too early.

    Walks the rows from self.start[0] up to, but not including,
    self.end[0]. Whenever the cell of a row in column self.start[1] is a
    text cell, that row is handed to _check_fill_title_row.

    TODO potential optimization, mark rows/columns that were already
    processed and skip them here.
    '''
    for row in range(self.start[0], self.end[0]):
        # Only rows whose first block cell is text are title rows
        leading_cell = self.table[row][self.start[1]]
        if not is_text_cell(leading_cell):
            continue
        self._check_fill_title_row(row)
def _check_interpret_cell(self, cell, prior_cell, row_index, column_index):
    '''
    Helper that inspects a cell's type and, where needed, replaces it:
    blank cells take the prior title and non-text cells become strings.
    Each replacement is recorded through flag_change under the
    'interpreted' key.

    Returns:
        A tuple of the form '(cell, changed)' where 'changed' indicates if
        'cell' differs from input.
    '''
    position = (row_index, column_index)

    # If we find a blank cell, propagate the prior title
    if is_empty_cell(cell):
        self.flag_change(self.flags, 'interpreted', position,
                         self.worksheet, self.FLAGS['copied-title'])
        return prior_cell, True

    # Text cells are returned untouched
    if is_text_cell(cell):
        return cell, False

    # Anything else is non-empty and non-text: convert it to a string
    self.flag_change(self.flags, 'interpreted', position,
                     self.worksheet, self.FLAGS['converted-to-string'])
    return str(cell), True
def _find_block_start(self, table, used_cells, possible_block_start,
                      start_pos, end_pos):
    '''
    Finds the start of a block from a suggested start location.
    This location can be at a lower column but not a lower row.
    Note this also finds the lowest row of block_end.

    Args:
        table: Row-indexable table of cells (read as table[row][column]).
        used_cells: Indexed like table; a truthy entry stops the scan at
            that cell.
        possible_block_start: Suggested (row, column) start; indices 0 and
            1 are read.
        start_pos: Only start_pos[1] is read, as the left-most column the
            leftward search may reach.
        end_pos: Only end_pos[0] is read, as the last row scanned.

    Returns:
        A tuple (block_start, block_end). Both are lists copied from
        possible_block_start; block_start may get a smaller column, and
        block_end may get a larger row and column.

    NOTE(review): this method was re-indented from a copy that had lost its
    indentation; the nesting below is the most plausible reading and should
    be confirmed against the canonical source.
    '''
    current_col = possible_block_start[1]
    block_start = list(possible_block_start)
    block_end = list(possible_block_start)
    repeat = True
    checked_all = False
    # Repeat until we've met satisfactory conditions for
    # catching all edge cases or we've checked all valid
    # block locations
    while (not checked_all and repeat):
        block_end[0] = max(block_end[0], possible_block_start[0])
        block_end[1] = max(block_end[1], current_col)

        # The transposed objects are indexed by column here and by row below
        table_column = TableTranspose(table)[current_col]
        used_column = TableTranspose(used_cells)[current_col]
        # We need to find a non empty cell before we can stop
        blank_start = is_empty_cell(table_column[possible_block_start[0]])
        # Unless we have assume_complete_blocks set to True
        if blank_start and self.assume_complete_blocks:
            # Found a blank? We're done
            repeat = False
            break
        # Stays False until the first non-empty cell of this column is seen
        blank_exited = not blank_start
        blank_repeat_threshold = 3
        # NOTE(review): parent_title is assigned but never read in this method
        parent_title = blank_start or is_text_cell(
            table_column[possible_block_start[0]])
        #TODO refactor code below into new function for easier reading
        # Analyze the beginning columns
        for row_index in range(possible_block_start[0], end_pos[0] + 1):
            # Ensure we catch the edge case of the data reaching the edge of
            # the table -- block_end should then equal end_pos
            if blank_exited:
                block_end[0] = max(block_end[0], row_index)
            if row_index == end_pos[0] or used_column[row_index]:
                # We've gone through the whole range
                checked_all = True
                break
            elif not blank_exited:
                blank_exited = not is_empty_cell(table_column[row_index])
            elif is_empty_cell(table_column[row_index]):
                # Data in this column ended; continue with the next column
                current_col += 1
                break
            else:
                # Go find the left most column that's still valid
                table_row = table[row_index]
                used_row = used_cells[row_index]
                for column_index in range(current_col, start_pos[1] - 1, -1):
                    if is_empty_cell(table_row[column_index]
                                     ) or used_row[column_index]:
                        break
                    else:
                        block_start[1] = min(block_start[1], column_index)
        # Check if we've seen few enough cells to guess that we have a repeating title
        # NOTE(review): placing this after the row loop (not inside it) is
        # inferred from the comment's intent -- confirm.
        repeat = blank_start or 1 + row_index - possible_block_start[
            0] <= blank_repeat_threshold

    return block_start, block_end