def __init__(self, resource=None, fields=None, sheet=0, encoding=None, skip_rows=0, has_header=True): """Creates a XLSX spreadsheet data source stream. Attributes: * resource: file name, URL or file-like object * sheet: sheet index number (as int) or sheet name * has_header: flag determining whether first line contains header or not. ``True`` by default. """ if isinstance(resource, openpyxl.Workbook): self.workbook = resource else: self.workbook = _load_workbook(resource) if isinstance(sheet, int): self.sheet = self.workbook.worksheets[sheet] elif isinstance(sheet, str): self.sheet = self.workbook[sheet] else: raise ArgumentError('sheet has to be a string or an integer') if has_header: self.first_row = skip_rows + 1 else: self.first_row = skip_rows if fields: self.fields = fields else: rows = enumerate(self.sheet.rows) first_row = next(dropwhile(lambda x: x[0] < self.first_row, rows))[1] if has_header: header_row = next(self.sheet.rows) names = [str(c.value) for c in header_row] else: names = ['col%d' % i for i in range(len(first_row))] self.fields = FieldList() for name, cell in zip(names, first_row): if cell.is_date: storage_type = 'date' else: storage_type = CELL_TYPES.get(cell.data_type, 'unknown') field = Field(name, storage_type=storage_type) self.fields.append(field)
def __init__(self, resource=None, fields=None, sheet=0, encoding=None, skip_rows=0, has_header=True, stop_empty_line=False): """Creates a XLSX spreadsheet data source stream. Attributes: * fields: `bubbles.metadata.FieldList` to use instead of auto-detection * resource: file name, URL or file-like object * sheet: sheet index number (as int) or sheet name * has_header: flag determining whether first line contains header or not. ``True`` by default. * stop_empty_line: flag to stop iteration over rows at the first encounter with an empty line. As XLSX files can contain millions or rows, this might cause very long iterations, especially if all the lines are empty past a certain point """ if isinstance(resource, openpyxl.Workbook): self.workbook = resource else: self.workbook = _load_workbook(resource) if isinstance(sheet, int): self.sheet = self.workbook.worksheets[sheet] elif isinstance(sheet, str): self.sheet = self.workbook[sheet] else: raise ArgumentError('sheet has to be a string or an integer') if has_header: self.first_row = skip_rows + 1 else: self.first_row = skip_rows self.stop_empty_line = stop_empty_line if fields: self.fields = fields else: rows = enumerate(self.sheet.rows) first_row = next(dropwhile(lambda x: x[0] < self.first_row, rows))[1] if has_header: header_rows = enumerate(self.sheet.rows) header_row = next(dropwhile(lambda x: x[0] < (self.first_row - 1), header_rows))[1] # fetch names, replace line breaks by spaces in case of # manual line wrapping names = [' '.join(str(c.value).split()) for c in header_row] else: names = ['col%d' % i for i in range(len(first_row))] self.fields = FieldList() for name, cell in zip(names, first_row): if cell.is_date: storage_type = 'date' else: storage_type = CELL_TYPES.get(cell.data_type, 'unknown') field = Field(name, storage_type=storage_type) self.fields.append(field)
def test_skip(self): obj = XLSXObject(data_path("data.xlsx"), FieldList("number", "name"), skip_rows=2) self.assertEqual(["number", "name"], obj.fields.names()) rows = list(obj.rows()) self.assertEqual(2, len(obj)) self.assertEqual(2, len(rows)) self.assertSequenceEqual([3.0, "Cecil"], rows[0])
def test_no_header(self): with self.assertRaises(ArgumentError): obj = CSVSource(data_path("fruits-sk.csv"), read_header=False) fields = FieldList("id", "fruit", "type") obj = CSVSource(data_path("fruits-sk.csv"), read_header=False, fields=fields) self.assertEqual(["id", "fruit", "type"], obj.fields.names()) rows = list(obj.rows()) self.assertEqual(17, len(rows)) self.assertEqual(["id", "fruit", "type"], rows[0]) obj.release()
class XLSXObject(DataObject): __identifier__ = "xlsx" def __init__(self, resource=None, fields=None, sheet=0, encoding=None, skip_rows=0, has_header=True): """Creates a XLSX spreadsheet data source stream. Attributes: * resource: file name, URL or file-like object * sheet: sheet index number (as int) or sheet name * has_header: flag determining whether first line contains header or not. ``True`` by default. """ if isinstance(resource, openpyxl.Workbook): self.workbook = resource else: self.workbook = _load_workbook(resource) if isinstance(sheet, int): self.sheet = self.workbook.worksheets[sheet] elif isinstance(sheet, str): self.sheet = self.workbook[sheet] else: raise ArgumentError('sheet has to be a string or an integer') if has_header: self.first_row = skip_rows + 1 else: self.first_row = skip_rows if fields: self.fields = fields else: rows = enumerate(self.sheet.rows) first_row = next(dropwhile(lambda x: x[0] < self.first_row, rows))[1] if has_header: header_row = next(self.sheet.rows) names = [str(c.value) for c in header_row] else: names = ['col%d' % i for i in range(len(first_row))] self.fields = FieldList() for name, cell in zip(names, first_row): if cell.is_date: storage_type = 'date' else: storage_type = CELL_TYPES.get(cell.data_type, 'unknown') field = Field(name, storage_type=storage_type) self.fields.append(field) def _nrows(self): return sum([1 for _ in self.sheet.rows]) def representations(self): return ["rows", "records"] def __len__(self): return self._nrows() - self.first_row def rows(self): if not self.fields: raise RuntimeError("Fields are not initialized") field_count = len(self.fields) rows = enumerate(self.sheet.rows) rows = dropwhile(lambda x: x[0] < self.first_row, rows) for _, row in rows: yield tuple([c.value for c in row[:field_count]]) def records(self): fields = self.fields.names() for row in self.rows(): yield dict(zip(fields, row)) def is_consumable(self): return False