def test_create_table_import_fields_ordering(self):
    """`create_table` must respect column order (turicas/rows#239)."""
    # From: https://github.com/turicas/rows/issues/239
    sample_data = [
        ["intfield", "textfield", "floatfield"],
        [1, "str1", 1.2],
        [2, "str2", 2.3],
        [3, "str3", 3.4],
    ]
    # `fields` parameter on `create_table` must always be in the same order
    # as the data.
    declared_fields = OrderedDict()
    declared_fields["intfield"] = rows.fields.IntegerField
    declared_fields["textfield"] = rows.fields.TextField
    declared_fields["floatfield"] = rows.fields.FloatField

    # Regular case: no `import_fields` specified
    table = plugins_utils.create_table(
        sample_data, fields=declared_fields, skip_header=True
    )
    self.assertEqual(table.fields, declared_fields)
    for row, expected in zip(table, sample_data[1:]):
        self.assertEqual(expected, [row.intfield, row.textfield, row.floatfield])

    # Special case: `import_fields` has different order from `fields`
    import_fields = ["textfield", "intfield"]
    table = plugins_utils.create_table(
        sample_data,
        fields=declared_fields,
        import_fields=import_fields,
        skip_header=True,
    )
    self.assertEqual(list(table.fields.keys()), import_fields)
    for row, expected in zip(table, sample_data[1:]):
        self.assertEqual(expected[1], row.textfield)
        self.assertEqual(expected[0], row.intfield)
def test_create_table_skip_header(self):
    """`skip_header=True` must drop the first data row."""
    field_types = OrderedDict()
    field_types['integer'] = fields.IntegerField
    field_types['string'] = fields.TextField
    data = [['1', 'Álvaro'], ['2', 'turicas'], ['3', 'Justen']]

    table_1 = plugins_utils.create_table(data, fields=field_types, skip_header=True)
    table_2 = plugins_utils.create_table(data, fields=field_types, skip_header=False)

    # Field definitions are unaffected by skip_header
    self.assertEqual(field_types, table_1.fields)
    self.assertEqual(table_1.fields, table_2.fields)
    self.assertEqual(len(table_1), 2)
    self.assertEqual(len(table_2), 3)

    expected = [
        {'integer': 1, 'string': 'Álvaro'},
        {'integer': 2, 'string': 'turicas'},
        {'integer': 3, 'string': 'Justen'},
    ]
    # table_1 skipped the first row, so it starts at the second one
    self.assertEqual(dict(table_1[0]._asdict()), expected[1])
    self.assertEqual(dict(table_1[1]._asdict()), expected[2])
    # table_2 kept every row
    self.assertEqual(dict(table_2[0]._asdict()), expected[0])
    self.assertEqual(dict(table_2[1]._asdict()), expected[1])
    self.assertEqual(dict(table_2[2]._asdict()), expected[2])
def test_create_table_import_fields_dont_exist(self):
    """Unknown names in `import_fields` must raise ValueError."""
    header = ['field1', 'field2', 'field3']
    table_rows = [
        ['1', 3.14, 'Álvaro'],
        ['2', 2.71, 'turicas'],
        ['3', 1.23, 'Justen'],
    ]
    # Mix two valid names with two names that are not in the header
    error_fields = ['doesnt_exist', 'ruby']
    import_fields = header[:-1] + error_fields
    with self.assertRaises(ValueError) as exception_context:
        plugins_utils.create_table(
            [header] + table_rows, import_fields=import_fields
        )
    self.assertIn(
        exception_context.exception.args[0],
        possible_field_names_errors(error_fields),
    )
def test_create_table_import_fields_dont_exist(self):
    """Unknown names in `import_fields` must raise ValueError."""
    header = ['field1', 'field2', 'field3']
    table_rows = [['1', 3.14, 'Álvaro'],
                  ['2', 2.71, 'turicas'],
                  ['3', 1.23, 'Justen']]
    error_fields = ['doesnt_exist', 'ruby']
    import_fields = list(header)[:-1] + error_fields
    with self.assertRaises(ValueError) as exception_context:
        plugins_utils.create_table([header] + table_rows,
                                   import_fields=import_fields)
    # `exception.message` does not exist on Python 3 (it raises
    # AttributeError); `args[0]` is the portable way to read the message.
    self.assertIn(exception_context.exception.args[0],
                  possible_field_names_errors(error_fields))
def import_from_xlsx(filename_or_fobj, sheet_name=None, sheet_index=0,
                     start_row=0, start_column=0, *args, **kwargs):
    """Return a rows.Table created from an XLSX file or file-like object.

    `sheet_name` (or `sheet_index` when no name is given) selects the
    worksheet; `start_row`/`start_column` trim the imported region (bounded
    below by the sheet's own minimum row/column).
    """
    workbook = load_workbook(filename_or_fobj)
    if sheet_name is None:
        sheet_name = workbook.sheetnames[sheet_index]
    # `Workbook.get_sheet_by_name()` is deprecated and was removed in
    # openpyxl 3.x; subscripting the workbook is the supported replacement.
    sheet = workbook[sheet_name]

    # openpyxl uses 1-based, inclusive bounds
    start_row, end_row = max(start_row, sheet.min_row), sheet.max_row
    start_col, end_col = max(start_column, sheet.min_column), sheet.max_column
    table_rows = [
        [
            _cell_to_python(sheet.cell(row=row_index, column=col_index))
            for col_index in range(start_col, end_col + 1)
        ]
        for row_index in range(start_row, end_row + 1)
    ]

    filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)
    metadata = {
        'imported_from': 'xlsx',
        'filename': filename,
        'sheet_name': sheet_name,
    }
    return create_table(table_rows, meta=metadata, *args, **kwargs)
def import_from_xlsx(filename_or_fobj, sheet_name=None, sheet_index=0,
                     start_row=0, start_column=0, *args, **kwargs):
    """Return a rows.Table created from an XLSX file or file-like object.

    The header is read from `start_row`, starting at `start_column`, up to
    the first empty cell; data rows are read below it until the first
    completely empty row.
    """
    workbook = load_workbook(filename_or_fobj)
    if sheet_name is None:
        sheet_name = workbook.sheetnames[sheet_index]
    # `Workbook.get_sheet_by_name()` is deprecated and was removed in
    # openpyxl 3.x; subscripting the workbook is the supported replacement.
    sheet = workbook[sheet_name]

    # Get sheet header: walk right until the first empty header cell
    header = []
    last_column = start_column
    header_value = _get_cell_value(sheet, start_row, last_column)
    while header_value:
        header.append(header_value)
        last_column += 1
        header_value = _get_cell_value(sheet, start_row, last_column)
    last_column -= 1  # index of the last non-empty header column

    # Get sheet rows based on `last_column` defined in 'get sheet header':
    # stop at the first row that has no values at all
    row_pos = start_row + 1
    all_rows = []
    row = _read_row(sheet, row_pos, last_column)
    while any(row):
        all_rows.append(row)
        row_pos += 1
        row = _read_row(sheet, row_pos, last_column)

    filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)
    metadata = {'imported_from': 'xlsx', 'filename': filename, }
    return create_table([header] + all_rows, meta=metadata, *args, **kwargs)
def extract_ibama_pdf(filename):
    """Extract all pages from a "Autuação Ambiental" PDF, generated by IBAMA

    This function will extract each page at once so we can use rows'
    `starts_after/ends_before`. It's using `pdf_table_lines` instead of
    `import_from_pdf` because it's faster and we can fix the table lines
    before importing data as a `rows.Table`.
    """
    final = []  # accumulates the fixed rows of every page
    total_pages = rows.plugins.pdf.number_of_pages(filename)
    for page_number in range(1, total_pages + 1):
        print('Processing page {}...'.format(page_number))
        # Re-open the file for each page so every call starts from a fresh
        # stream position.
        with open(filename, mode='rb') as fobj:
            table_rows = rows.plugins.pdf.pdf_table_lines(
                fobj,
                page_numbers=(page_number, ),
                starts_after='DIRETORIA DE PROTEÇÃO AMBIENTAL',
                ends_before=re.compile('Pag [0-9]+/[0-9]+'),
                algorithm='rects-boundaries',
            )
            # Only the first page contributes the header row
            final.extend(fix_rows(table_rows, header=page_number == 1))
    table = create_table(
        final,
        meta={'imported_from': 'pdf', 'filename': filename},
        force_types={'data_infracao': BRDateField, 'valor_multa': BRMoneyField},
    )
    return table
def import_from_postgresql(
    connection_or_uri,
    table_name="table1",
    query=None,
    query_args=None,
    close_connection=False,
    *args,
    **kwargs
):
    """Return a rows.Table with data from a PostgreSQL table or query."""
    if query is None:
        # The table name is interpolated into the SQL, so validate it first
        if not _valid_table_name(table_name):
            raise ValueError("Invalid table name: {}".format(table_name))
        query = SQL_SELECT_ALL.format(table_name=table_name)
    query_args = tuple() if query_args is None else query_args

    connection = _get_connection(connection_or_uri)
    cursor = connection.cursor()
    cursor.execute(query, query_args)
    data = list(cursor.fetchall())  # TODO: make it lazy
    column_names = [six.text_type(info[0]) for info in cursor.description]
    cursor.close()
    connection.commit()  # WHY?

    meta = {"imported_from": "postgresql", "source": connection_or_uri}
    if close_connection:
        connection.close()
    return create_table([column_names] + data, meta=meta, *args, **kwargs)
def import_from_xpath(filename_or_fobj, rows_xpath, fields_xpath,
                      encoding='utf-8', *args, **kwargs):
    """Return a rows.Table from XML/HTML using XPath expressions.

    `rows_xpath` selects the row elements; `fields_xpath` maps each field
    name to the XPath used to extract its value from a row element.
    """
    # All XPath expressions must be unicode strings
    xpath_types = {type(rows_xpath)}
    xpath_types.update(type(xpath) for xpath in fields_xpath.values())
    if xpath_types != {six.text_type}:
        raise TypeError('XPath must be {}'.format(six.text_type.__name__))

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
    tree = tree_from_string(fobj.read().decode(encoding))
    row_elements = tree.xpath(rows_xpath)

    header = list(fields_xpath.keys())
    extract = _get_row_data(fields_xpath)
    result_rows = [extract(element) for element in row_elements]

    meta = {
        'imported_from': 'xpath',
        'filename': filename,
        'encoding': encoding,
    }
    return create_table([header] + result_rows, meta=meta, *args, **kwargs)
def import_from_dicts(data, samples=None, *args, **kwargs):
    """Import data from a iterable of dicts

    The algorithm will use the `samples` first `dict`s to determine the
    field names (if `samples` is `None` all `dict`s will be used).
    """
    data = iter(data)

    # Consume (and cache) rows until the header list is decided, keeping
    # first-seen key order.
    headers, cached_rows = [], []
    for index, row in enumerate(data, start=1):
        cached_rows.append(row)
        for key in row:
            if key not in headers:
                headers.append(key)
        if samples and index == samples:
            break

    def normalize(row):
        # Missing keys are filled with None
        return [row.get(header, None) for header in headers]

    data_rows = map(normalize, chain(cached_rows, data))
    kwargs["samples"] = samples
    meta = {"imported_from": "dicts"}
    return create_table(chain([headers], data_rows), meta=meta, *args, **kwargs)
def import_from_sqlite(filename_or_connection, table_name="table1", query=None,
                       query_args=None, *args, **kwargs):
    """Return a rows.Table with data from SQLite database."""
    connection = _get_connection(filename_or_connection)
    cursor = connection.cursor()

    if query is None:
        # Table name goes straight into the SQL, so validate it first
        if not _valid_table_name(table_name):
            raise ValueError("Invalid table name: {}".format(table_name))
        query = SQL_SELECT_ALL.format(table_name=table_name)
    query_args = tuple() if query_args is None else query_args

    table_rows = list(cursor.execute(query, query_args))  # TODO: may be lazy
    header = [six.text_type(info[0]) for info in cursor.description]
    cursor.close()
    # TODO: should close connection also?

    meta = {"imported_from": "sqlite", "filename": filename_or_connection}
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)
def import_from_sqlite(
    filename_or_connection,
    table_name="table1",
    query=None,
    query_args=None,
    *args,
    **kwargs
):
    """Return a rows.Table with data from SQLite database."""
    connection = _get_connection(filename_or_connection)
    cursor = connection.cursor()

    if query is None:
        # The table name is interpolated into the SQL statement, so validate
        # it first to avoid SQL injection.
        if not _valid_table_name(table_name):
            raise ValueError("Invalid table name: {}".format(table_name))
        query = SQL_SELECT_ALL.format(table_name=table_name)

    if query_args is None:
        query_args = tuple()

    table_rows = list(cursor.execute(query, query_args))  # TODO: may be lazy
    # Column names come from the cursor description
    header = [six.text_type(info[0]) for info in cursor.description]
    cursor.close()
    # TODO: should close connection also?

    meta = {"imported_from": "sqlite", "filename": filename_or_connection}
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)
def test_create_table_empty_data(self):
    """Repeated header names must be deduplicated even with no data rows."""
    header = ['first', 'first', 'first']
    table = plugins_utils.create_table([header])
    self.assertEqual(list(table.fields.keys()), ['first', 'first_2', 'first_3'])
    self.assertEqual(len(table), 0)
def import_from_sqlite(filename_or_connection, table_name='table1', query=None,
                       query_args=None, *args, **kwargs):
    """Return a rows.Table with data from a SQLite database table or query."""
    connection = _get_connection(filename_or_connection)
    cursor = connection.cursor()

    if query is None:
        # The table name is interpolated into the SQL statement, so validate
        # it first to avoid SQL injection.
        if not _valid_table_name(table_name):
            raise ValueError('Invalid table name: {}'.format(table_name))
        query = SQL_SELECT_ALL.format(table_name=table_name)

    if query_args is None:
        query_args = tuple()

    table_rows = list(cursor.execute(query, query_args))  # TODO: may be lazy
    # Column names come from the cursor description
    header = [six.text_type(info[0]) for info in cursor.description]
    cursor.close()
    # TODO: should close connection also?

    meta = {
        'imported_from': 'sqlite',
        'filename': filename_or_connection,
    }
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)
def import_from_xls(filename_or_fobj, sheet_name=None, sheet_index=0,
                    start_row=0, start_column=0, *args, **kwargs):
    """Return a rows.Table created from an XLS file.

    `sheet_name` (or `sheet_index` when no name is given) selects the
    worksheet; `start_row`/`start_column` trim the imported region.
    """
    filename, _ = get_filename_and_fobj(filename_or_fobj, mode='rb')
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is not None:
        sheet = book.sheet_by_name(sheet_name)
    else:
        sheet = book.sheet_by_index(sheet_index)
    # TODO: may re-use Excel data types

    # Get header and rows
    table_rows = []
    for row_index in range(start_row, sheet.nrows):
        row = [
            cell_value(sheet, row_index, column_index)
            for column_index in range(start_column, sheet.ncols)
        ]
        table_rows.append(row)

    meta = {
        'imported_from': 'xls',
        'filename': filename,
        'sheet_name': sheet.name,
    }
    return create_table(table_rows, meta=meta, *args, **kwargs)
def import_from_dicts(data, samples=None, *args, **kwargs):
    """Import data from a iterable of dicts

    The algorithm will use the `samples` first `dict`s to determine the
    field names (if `samples` is `None` all `dict`s will be used).
    """
    data = iter(data)

    # Cache the sampled rows so they can be re-emitted after the header
    # (first-seen key order) is decided.
    cached_rows, headers = [], []
    for index, row in enumerate(data, start=1):
        cached_rows.append(row)

        for key in row.keys():
            if key not in headers:
                headers.append(key)

        if samples and index == samples:
            break

    # Lazily normalize every row (cached + remaining) to the header order,
    # filling missing keys with None.
    data_rows = (
        [row.get(header, None) for header in headers]
        for row in chain(cached_rows, data)
    )

    kwargs["samples"] = samples
    meta = {"imported_from": "dicts"}
    return create_table(chain([headers], data_rows), meta=meta, *args, **kwargs)
def import_from_csv(filename_or_fobj, encoding='utf-8', dialect=None,
                    sample_size=8192, *args, **kwargs):
    '''Import data from a CSV file

    If a file-like object is provided it MUST be in binary mode, like in
    `open(filename, mode='rb')`.
    '''
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')

    if dialect is None:
        # Sniff the dialect from a sample, then rewind to where we started
        position = fobj.tell()
        dialect = discover_dialect(fobj.read(sample_size), encoding)
        fobj.seek(position)

    reader = unicodecsv.reader(fobj, encoding=encoding, dialect=dialect)
    meta = {
        'imported_from': 'csv',
        'filename': filename,
        'encoding': encoding,
    }
    return create_table(reader, meta=meta, *args, **kwargs)
def import_from_xpath(filename_or_fobj, rows_xpath, fields_xpath,
                      encoding="utf-8", *args, **kwargs):
    """Return a rows.Table built from XML/HTML using XPath expressions.

    `rows_xpath` selects the row elements; `fields_xpath` maps each field
    name to the XPath used to extract its value from a row element.
    """
    # Every XPath expression must be a unicode string
    types = set([type(rows_xpath)] + [type(xpath) for xpath in fields_xpath.values()])
    if types != set([six.text_type]):
        raise TypeError("XPath must be {}".format(six.text_type.__name__))

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")
    xml = fobj.read().decode(encoding)
    tree = tree_from_string(xml)
    row_elements = tree.xpath(rows_xpath)

    # Field names keep the order of `fields_xpath`
    header = list(fields_xpath.keys())
    row_data = _get_row_data(fields_xpath)
    result_rows = list(map(row_data, row_elements))

    meta = {
        "imported_from": "xpath",
        "filename": filename,
        "encoding": encoding
    }
    return create_table([header] + result_rows, meta=meta, *args, **kwargs)
def import_from_pdf(
    filename_or_fobj,
    page_numbers=None,
    starts_after=None,
    ends_before=None,
    backend=None,
    algorithm="y-groups",
    x_threshold=0.5,
    y_threshold=0.5,
    *args,
    **kwargs
):
    """Return a rows.Table extracted from the table lines found in a PDF."""
    # Fall back to the default backend when none was provided
    backend = backend or default_backend()
    lines = pdf_table_lines(
        filename_or_fobj,
        page_numbers,
        starts_after=starts_after,
        ends_before=ends_before,
        algorithm=algorithm,
        x_threshold=x_threshold,
        y_threshold=y_threshold,
        backend=backend,
    )
    return create_table(lines, meta={"imported_from": "pdf"}, *args, **kwargs)
def import_from_postgresql(connection_or_uri, table_name="table1", query=None,
                           query_args=None, close_connection=False,
                           *args, **kwargs):
    """Return a rows.Table with data from a PostgreSQL table or query."""
    if query is None:
        # The table name is interpolated into the SQL statement, so validate
        # it first to avoid SQL injection.
        if not _valid_table_name(table_name):
            raise ValueError("Invalid table name: {}".format(table_name))
        query = SQL_SELECT_ALL.format(table_name=table_name)

    if query_args is None:
        query_args = tuple()

    connection = _get_connection(connection_or_uri)
    cursor = connection.cursor()
    cursor.execute(query, query_args)
    table_rows = list(cursor.fetchall())  # TODO: make it lazy
    # Column names come from the cursor description
    header = [six.text_type(info[0]) for info in cursor.description]
    cursor.close()
    connection.commit()  # WHY?

    meta = {"imported_from": "postgresql", "source": connection_or_uri}
    if close_connection:
        connection.close()
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)
def test_create_table_import_fields(self):
    """`import_fields` must select and reorder the table columns."""
    header = ['field1', 'field2', 'field3']
    table_rows = [
        ['1', 3.14, 'Álvaro'],
        ['2', 2.71, 'turicas'],
        ['3', 1.23, 'Justen'],
    ]

    # import_fields=None imports every column in header order
    table = plugins_utils.create_table([header] + table_rows, import_fields=None)
    self.assertEqual(list(table.fields.keys()), header)
    first = table[0]
    self.assertEqual(first.field1, 1)
    self.assertEqual(first.field2, 3.14)
    self.assertEqual(first.field3, 'Álvaro')

    # An explicit subset is imported in the given order
    import_fields = ['field3', 'field2']
    table = plugins_utils.create_table(
        [header] + table_rows, import_fields=import_fields
    )
    self.assertEqual(list(table.fields.keys()), import_fields)
    self.assertEqual(
        table[0]._asdict(),
        OrderedDict([('field3', 'Álvaro'), ('field2', 3.14)]),
    )
def test_create_table_import_fields(self):
    """`import_fields` must select and reorder the table columns."""
    header = ['field1', 'field2', 'field3']
    table_rows = [['1', 3.14, 'Álvaro'],
                  ['2', 2.71, 'turicas'],
                  ['3', 1.23, 'Justen']]

    table = plugins_utils.create_table([header] + table_rows,
                                       import_fields=None)
    # `dict.keys()` returns a view on Python 3 and a view never compares
    # equal to a list, so it must be materialized before comparing.
    self.assertEqual(list(table.fields.keys()), header)
    self.assertEqual(table[0].field1, 1)
    self.assertEqual(table[0].field2, 3.14)
    self.assertEqual(table[0].field3, 'Álvaro')

    import_fields = ['field3', 'field2']
    table = plugins_utils.create_table([header] + table_rows,
                                       import_fields=import_fields)
    self.assertEqual(list(table.fields.keys()), import_fields)
    self.assertEqual(table[0]._asdict(),
                     OrderedDict([('field3', 'Álvaro'), ('field2', 3.14)]))
def test_create_table_force_types(self):
    """`force_types` must override the detected type for the named fields."""
    header = ['field1', 'field2', 'field3']
    table_rows = [
        ['1', '3.14', 'Álvaro'],
        ['2', '2.71', 'turicas'],
        ['3', '1.23', 'Justen'],
    ]
    force_types = {'field2': rows.fields.DecimalField}
    table = plugins_utils.create_table(
        [header] + table_rows, force_types=force_types
    )
    # Every forced field must keep exactly the requested type
    for field_name, field_type in force_types.items():
        self.assertEqual(table.fields[field_name], field_type)
def test_create_table_import_fields_dont_exist(self):
    """Unknown names in `import_fields` must raise ValueError."""
    header = ["field1", "field2", "field3"]
    table_rows = [
        ["1", 3.14, "Álvaro"],
        ["2", 2.71, "turicas"],
        ["3", 1.23, "Justen"],
    ]
    # Mix two valid field names with two names absent from the header
    error_fields = ["doesnt_exist", "ruby"]
    import_fields = list(header)[:-1] + error_fields
    with self.assertRaises(ValueError) as exception_context:
        plugins_utils.create_table(
            [header] + table_rows, import_fields=import_fields
        )
    # The error message must be one of the accepted forms that mention
    # the unknown fields
    self.assertIn(
        exception_context.exception.args[0],
        possible_field_names_errors(error_fields),
    )
def import_from_txt( filename_or_fobj, encoding="utf-8", frame_style=FRAME_SENTINEL, *args, **kwargs ): """Return a rows.Table created from imported TXT file.""" # TODO: (maybe) # enable parsing of non-fixed-width-columns # with old algorithm - that would just split columns # at the vertical separator character for the frame. # (if doing so, include an optional parameter) # Also, this fixes an outstanding unreported issue: # trying to parse tables which fields values # included a Pipe char - "|" - would silently # yield bad results. filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb") raw_contents = fobj.read().decode(encoding).rstrip("\n") if frame_style is FRAME_SENTINEL: frame_style = _guess_frame_style(raw_contents) else: frame_style = _parse_frame_style(frame_style) contents = raw_contents.splitlines() del raw_contents if frame_style != "None": contents = contents[1:-1] del contents[1] else: # the table is possibly generated from other source. # check if the line we reserve as a separator is realy empty. if not contents[1].strip(): del contents[1] col_positions = _parse_col_positions(frame_style, contents[0]) table_rows = [ [ row[start + 1 : end].strip() for start, end in zip(col_positions, col_positions[1:]) ] for row in contents ] # # Variable columns - old behavior: # table_rows = [[value.strip() for value in row.split(vertical_char)[1:-1]] # for row in contents] meta = { "imported_from": "txt", "filename": filename, "encoding": encoding, "frame_style": frame_style, } return create_table(table_rows, meta=meta, *args, **kwargs)
def import_from_dicts(data, *args, **kwargs):
    """Import data from a list of dicts."""
    # Union of every key used by any row; sorted for a deterministic header
    headers = sorted({key for row in data for key in row.keys()})
    # Normalize each dict to the header order, filling gaps with None
    data = [[row.get(header, None) for header in headers] for row in data]
    meta = {'imported_from': 'dicts', }
    return create_table([headers] + data, meta=meta, *args, **kwargs)
def import_from_txt(filename_or_fobj, encoding='utf-8', frame_style=FRAME_SENTINEL, *args, **kwargs): """Return a rows.Table created from imported TXT file.""" # TODO: (maybe) # enable parsing of non-fixed-width-columns # with old algorithm - that would just split columns # at the vertical separator character for the frame. # (if doing so, include an optional parameter) # Also, this fixes an outstanding unreported issue: # trying to parse tables which fields values # included a Pipe char - "|" - would silently # yield bad results. filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb') raw_contents = fobj.read().decode(encoding).rstrip('\n') if frame_style is FRAME_SENTINEL: frame_style = _guess_frame_style(raw_contents) else: frame_style = _parse_frame_style(frame_style) contents = raw_contents.splitlines() del raw_contents if frame_style != 'None': contents = contents[1:-1] del contents[1] else: # the table is possibly generated from other source. # check if the line we reserve as a separator is realy empty. if not contents[1].strip(): del contents[1] col_positions = _parse_col_positions(frame_style, contents[0]) table_rows = [[ row[start + 1:end].strip() for start, end in zip(col_positions, col_positions[1:]) ] for row in contents] # # Variable columns - old behavior: # table_rows = [[value.strip() for value in row.split(vertical_char)[1:-1]] # for row in contents] meta = { 'imported_from': 'txt', 'filename': filename, 'encoding': encoding, 'frame_style': frame_style } return create_table(table_rows, meta=meta, *args, **kwargs)
def test_create_table_repeated_field_names(self):
    """Duplicated or empty header names must be made unique."""
    header = ['first', 'first', 'first']
    table_rows = [['1', 3.14, 'Álvaro'],
                  ['2', 2.71, 'turicas'],
                  ['3', 1.23, 'Justen']]
    table = plugins_utils.create_table([header] + table_rows)
    # `dict.keys()` returns a view on Python 3 and a view never compares
    # equal to a list, so it must be materialized before comparing.
    self.assertEqual(list(table.fields.keys()), ['first', 'first_2', 'first_3'])
    self.assertEqual(table[0].first, 1)
    self.assertEqual(table[0].first_2, 3.14)
    self.assertEqual(table[0].first_3, 'Álvaro')

    header = ['field', '', 'field']
    table_rows = [['1', 3.14, 'Álvaro'],
                  ['2', 2.71, 'turicas'],
                  ['3', 1.23, 'Justen']]
    table = plugins_utils.create_table([header] + table_rows)
    self.assertEqual(list(table.fields.keys()), ['field', 'field_1', 'field_2'])
    self.assertEqual(table[0].field, 1)
    self.assertEqual(table[0].field_1, 3.14)
    self.assertEqual(table[0].field_2, 'Álvaro')
def test_create_table_repeated_field_names(self):
    """Duplicated or empty header names must be made unique."""
    table_rows = [
        ['1', 3.14, 'Álvaro'],
        ['2', 2.71, 'turicas'],
        ['3', 1.23, 'Justen'],
    ]

    # Repeated names get a numeric suffix starting at 2
    table = plugins_utils.create_table([['first', 'first', 'first']] + table_rows)
    self.assertEqual(list(table.fields.keys()), ['first', 'first_2', 'first_3'])
    self.assertEqual(table[0].first, 1)
    self.assertEqual(table[0].first_2, 3.14)
    self.assertEqual(table[0].first_3, 'Álvaro')

    # Empty names are filled in by column position
    table = plugins_utils.create_table([['field', '', 'field']] + table_rows)
    self.assertEqual(list(table.fields.keys()), ['field', 'field_1', 'field_2'])
    self.assertEqual(table[0].field, 1)
    self.assertEqual(table[0].field_1, 3.14)
    self.assertEqual(table[0].field_2, 'Álvaro')
def import_from_parquet(filename, encoding='utf-8', *args, **kwargs): 'Import data from a Parquet file' # TODO: should be able to used fobj also data, field_names = parquet.dump(filename, OPTIONS, _callback) length = len(data[field_names[0]]) table_rows = [[data[field_name][index] for field_name in field_names] for index in range(length)] meta = {'imported_from': 'parquet', 'filename': filename,} return create_table([field_names] + table_rows, meta=meta, *args, **kwargs)
def test_create_table_import_fields(self):
    """`import_fields` must select and reorder the table columns."""
    header = ["field1", "field2", "field3"]
    table_rows = [
        ["1", 3.14, "Álvaro"],
        ["2", 2.71, "turicas"],
        ["3", 1.23, "Justen"],
    ]

    # import_fields=None: every column is imported, in header order
    table = plugins_utils.create_table([header] + table_rows, import_fields=None)
    self.assertEqual(list(table.fields.keys()), header)
    self.assertEqual(table[0].field1, 1)
    self.assertEqual(table[0].field2, 3.14)
    self.assertEqual(table[0].field3, "Álvaro")

    # An explicit subset is imported in the given order
    import_fields = ["field3", "field2"]
    table = plugins_utils.create_table(
        [header] + table_rows, import_fields=import_fields
    )
    self.assertEqual(list(table.fields.keys()), import_fields)
    self.assertEqual(
        table[0]._asdict(), OrderedDict([("field3", "Álvaro"), ("field2", 3.14)])
    )
def import_from_dicts(data, *args, **kwargs):
    'Import data from a list of dicts'
    # Collect the union of all keys; sorting gives a deterministic header
    # order regardless of dict iteration order.
    headers = set()
    for row in data:
        headers.update(row.keys())
    headers = sorted(list(headers))

    # Normalize each dict to the header order, filling missing keys with None
    data = [[row.get(header, None) for header in headers] for row in data]

    meta = {'imported_from': 'dicts', }
    return create_table([headers] + data, meta=meta, *args, **kwargs)
def test_create_table_skip_header(self):
    """`skip_header=True` must drop the first data row."""
    field_types = OrderedDict([('integer', fields.IntegerField),
                               ('string', fields.TextField),])
    data = [['1', 'Álvaro'], ['2', 'turicas'], ['3', 'Justen']]
    table_1 = plugins_utils.create_table(data, fields=field_types,
                                         skip_header=True)
    table_2 = plugins_utils.create_table(data, fields=field_types,
                                         skip_header=False)

    # Field definitions are unaffected by skip_header
    self.assertEqual(field_types, table_1.fields)
    self.assertEqual(table_1.fields, table_2.fields)
    # table_1 lost the first row; table_2 kept all three
    self.assertEqual(len(table_1), 2)
    self.assertEqual(len(table_2), 3)

    first_row = {'integer': 1, 'string': 'Álvaro'}
    second_row = {'integer': 2, 'string': 'turicas'}
    third_row = {'integer': 3, 'string': 'Justen'}
    self.assertEqual(dict(table_1[0]._asdict()), second_row)
    self.assertEqual(dict(table_2[0]._asdict()), first_row)
    self.assertEqual(dict(table_1[1]._asdict()), third_row)
    self.assertEqual(dict(table_2[1]._asdict()), second_row)
    self.assertEqual(dict(table_2[2]._asdict()), third_row)
def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    'Import data from a JSON file'
    kwargs['encoding'] = encoding
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    # `json.load(..., encoding=...)` is ignored on Python 3 and raises
    # TypeError from 3.9 on; decode manually when the stream yields bytes.
    contents = fobj.read()
    if isinstance(contents, bytes):
        contents = contents.decode(encoding)
    json_obj = json.loads(contents)

    # Materialize the keys: a dict view is not a proper header row
    field_names = list(json_obj[0].keys())
    table_rows = [[item[key] for key in field_names] for item in json_obj]

    data = [field_names] + table_rows
    meta = {'imported_from': 'json', 'filename': filename, }
    return create_table(data, meta=meta, *args, **kwargs)
def transpose(table, fields_column, *args, **kwargs):
    """Transpose a rows.Table, using `fields_column` values as new field names."""
    new_field_names = []
    # One output row per remaining input column
    transposed = [dict() for _ in range(len(table.fields) - 1)]

    for row in table:
        row_dict = row._asdict()
        name = row_dict.pop(fields_column)
        new_field_names.append(name)
        for position, value in enumerate(row_dict.values()):
            transposed[position][name] = value

    table_rows = [
        [row[field_name] for field_name in new_field_names]
        for row in transposed
    ]
    return create_table([new_field_names] + table_rows, *args, **kwargs)
def import_from_sqlite(filename_or_connection, table_name='rows', query=None,
                       *args, **kwargs):
    """Return a rows.Table with data from a SQLite database table or query."""
    connection = _get_connection(filename_or_connection)
    cursor = connection.cursor()

    # NOTE(review): `table_name` is interpolated straight into the SQL with
    # no validation, so an untrusted value allows SQL injection — confirm
    # callers only pass trusted names (later versions validate it).
    sql = query if query else SQL_SELECT_ALL.format(table_name=table_name)
    cursor.execute(sql)

    # Column names come from the cursor description
    header = [info[0] for info in cursor.description]
    table_rows = list(cursor)  # TODO: may not put everything in memory
    cursor.close()

    meta = {'imported_from': 'sqlite', 'filename': filename_or_connection, }
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)
def import_from_xpath(filename_or_fobj, rows_xpath, fields_xpath,
                      encoding='utf-8', *args, **kwargs):
    """Return a rows.Table built from XML/HTML using XPath expressions.

    `rows_xpath` selects the row elements; `fields_xpath` maps each field
    name to the XPath used to extract its value from a row element.
    """
    filename, fobj = get_filename_and_fobj(filename_or_fobj)
    kwargs['encoding'] = encoding
    xml = fobj.read().decode(encoding)
    tree = tree_from_string(xml)
    row_elements = tree.xpath(rows_xpath)

    # Materialize the keys: on Python 3 `dict.keys()` is a view, not a
    # list, and the header row must be a concrete sequence.
    header = list(fields_xpath.keys())
    result_rows = [_get_row_data(row, fields_xpath) for row in row_elements]

    meta = {'imported_from': 'xpath', 'filename': filename,}
    return create_table([header] + result_rows, meta=meta, *args, **kwargs)
def test_create_table_skip_header(self):
    """`skip_header=True` must drop the first data row."""
    field_types = OrderedDict(
        [("integer", fields.IntegerField), ("string", fields.TextField)]
    )
    data = [["1", "Álvaro"], ["2", "turicas"], ["3", "Justen"]]
    table_1 = plugins_utils.create_table(data, fields=field_types, skip_header=True)
    table_2 = plugins_utils.create_table(
        data, fields=field_types, skip_header=False
    )

    # Field definitions are unaffected by skip_header
    self.assertEqual(field_types, table_1.fields)
    self.assertEqual(table_1.fields, table_2.fields)
    # table_1 lost the first row; table_2 kept all three
    self.assertEqual(len(table_1), 2)
    self.assertEqual(len(table_2), 3)

    first_row = {"integer": 1, "string": "Álvaro"}
    second_row = {"integer": 2, "string": "turicas"}
    third_row = {"integer": 3, "string": "Justen"}
    self.assertEqual(dict(table_1[0]._asdict()), second_row)
    self.assertEqual(dict(table_2[0]._asdict()), first_row)
    self.assertEqual(dict(table_1[1]._asdict()), third_row)
    self.assertEqual(dict(table_2[1]._asdict()), second_row)
    self.assertEqual(dict(table_2[2]._asdict()), third_row)
def import_from_json(filename_or_fobj, encoding="utf-8", *args, **kwargs):
    """Import a JSON file or file-like object into a `rows.Table`.

    If a file-like object is provided it MUST be open in text (non-binary)
    mode on Python 3 and could be open in both binary or text mode on
    Python 2.
    """
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    # `json.load(..., encoding=...)` is ignored on Python 3 and raises
    # TypeError from 3.9 on; decode manually when the stream yields bytes.
    contents = fobj.read()
    if isinstance(contents, bytes):
        contents = contents.decode(encoding)
    json_obj = json.loads(contents)

    field_names = list(json_obj[0].keys())
    table_rows = [[item[key] for key in field_names] for item in json_obj]
    meta = {"imported_from": "json", "filename": filename, "encoding": encoding}
    return create_table([field_names] + table_rows, meta=meta, *args, **kwargs)
def test_create_table_force_types(self):
    """`force_types` must override the detected type for the named fields."""
    header = ["field1", "field2", "field3"]
    table_rows = [
        ["1", "3.14", "Álvaro"],
        ["2", "2.71", "turicas"],
        ["3", "1.23", "Justen"],
    ]
    force_types = {"field2": rows.fields.DecimalField}
    table = plugins_utils.create_table(
        [header] + table_rows, force_types=force_types
    )
    # Every forced field must keep exactly the requested type
    for field_name, field_type in force_types.items():
        self.assertEqual(table.fields[field_name], field_type)
def import_from_txt(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    """Return a rows.Table created from a frame-formatted TXT file."""
    # TODO: should be able to change DASH, PLUS and PIPE
    filename, fobj = get_filename_and_fobj(filename_or_fobj)
    kwargs['encoding'] = encoding
    contents = fobj.read().decode(encoding).strip().splitlines()

    # remove '+----+----+' lines: top/bottom frame borders and the
    # header/body separator (index 1 after the slice)
    contents = contents[1:-1]
    del contents[1]

    # Split each framed line on the pipe character; drop the outer borders
    table_rows = [[value.strip() for value in row.split(PIPE)[1:-1]]
                  for row in contents]
    meta = {'imported_from': 'txt', 'filename': filename,}
    return create_table(table_rows, meta=meta, *args, **kwargs)
def import_from_html(
    filename_or_fobj,
    encoding="utf-8",
    index=0,
    ignore_colspan=True,
    preserve_html=False,
    properties=False,
    table_tag="table",
    row_tag="tr",
    column_tag="td|th",
    *args,
    **kwargs
):
    """Return rows.Table from HTML file.

    `index` selects which `table_tag` element in the document to import;
    `preserve_html` keeps each cell's raw HTML; `properties` appends a
    column with each row element's attributes; `ignore_colspan` drops rows
    whose cell count differs from the widest row.
    """
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")
    html = fobj.read().decode(encoding)
    html_tree = document_fromstring(html)
    tables = html_tree.xpath("//{}".format(table_tag))
    table = tables[index]

    # Remove <thead>/<tbody> wrappers so all rows match `row_tag` uniformly
    strip_tags(table, "thead")
    strip_tags(table, "tbody")
    row_elements = table.xpath(row_tag)

    table_rows = [
        _get_row(
            row,
            column_tag=column_tag,
            preserve_html=preserve_html,
            properties=properties,
        )
        for row in row_elements
    ]

    if properties:
        # Name the extra attributes column appended by `_get_row`
        table_rows[0][-1] = "properties"

    if preserve_html and kwargs.get("fields", None) is None:
        # The field names will be the first table row, so we need to strip HTML
        # from it even if `preserve_html` is `True` (it's `True` only for rows,
        # not for the header).
        table_rows[0] = list(map(_extract_node_text, row_elements[0]))

    if ignore_colspan:
        # Keep only the rows with as many cells as the widest row
        max_columns = max(map(len, table_rows))
        table_rows = [row for row in table_rows if len(row) == max_columns]

    meta = {"imported_from": "html", "filename": filename, "encoding": encoding}
    return create_table(table_rows, meta=meta, *args, **kwargs)
def import_from_xls(filename_or_fobj, sheet_name=None, sheet_index=0,
                    start_row=0, start_column=0, *args, **kwargs):
    """Return a rows.Table created from an XLS file.

    Header cells are read from `start_row` rightwards until the first empty
    cell; data rows are read below it until the first fully empty row.
    """
    filename, _ = get_filename_and_fobj(filename_or_fobj)
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is not None:
        sheet = book.sheet_by_name(sheet_name)
    else:
        sheet = book.sheet_by_index(sheet_index)
    # TODO: may re-use Excel data types

    # Get field names
    # TODO: may use sheet.col_values or even sheet.ncols
    column_count = 0
    header = []
    column_value = cell_value(sheet, start_row, start_column + column_count)
    while column_value:
        header.append(column_value)
        column_count += 1
        column_value = cell_value(sheet, start_row, start_column + column_count)

    # Get sheet rows
    # TODO: may use sheel.col_slice or even sheet.nrows
    table_rows = []
    row_count = 0
    start_row += 1  # data starts on the row right below the header
    cell_is_empty = False
    while not cell_is_empty:
        row = [
            cell_value(sheet, start_row + row_count, start_column + column_index)
            for column_index in range(column_count)
        ]
        # A row with no truthy values at all terminates the scan
        cell_is_empty = not any(row)
        if not cell_is_empty:
            table_rows.append(row)
            row_count += 1

    meta = {
        'imported_from': 'xls',
        'filename': filename,
    }
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)
def test_create_table_repeated_field_names(self):
    """Duplicated or empty header names must be made unique."""
    header = ["first", "first", "first"]
    table_rows = [
        ["1", 3.14, "Álvaro"],
        ["2", 2.71, "turicas"],
        ["3", 1.23, "Justen"],
    ]
    table = plugins_utils.create_table([header] + table_rows)
    # Repeated names get a numeric suffix starting at 2
    self.assertEqual(list(table.fields.keys()), ["first", "first_2", "first_3"])
    self.assertEqual(table[0].first, 1)
    self.assertEqual(table[0].first_2, 3.14)
    self.assertEqual(table[0].first_3, "Álvaro")

    header = ["field", "", "field"]
    table_rows = [
        ["1", 3.14, "Álvaro"],
        ["2", 2.71, "turicas"],
        ["3", 1.23, "Justen"],
    ]
    table = plugins_utils.create_table([header] + table_rows)
    # Empty names are filled in by column position
    self.assertEqual(list(table.fields.keys()), ["field", "field_1", "field_2"])
    self.assertEqual(table[0].field, 1)
    self.assertEqual(table[0].field_1, 3.14)
    self.assertEqual(table[0].field_2, "Álvaro")
def import_from_parquet(filename_or_fobj, *args, **kwargs):
    'Import data from a Parquet file'
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')

    # TODO: should look into `schema.converted_type` also
    # Map each Parquet column to the corresponding rows field type, read
    # from the file footer's schema (entries without a type are skipped).
    types = OrderedDict([(schema.name, PARQUET_TO_ROWS[schema.type])
                         for schema in parquet._read_footer(fobj).schema
                         if schema.type is not None])
    header = list(types.keys())
    table_rows = list(parquet.reader(fobj))  # TODO: be lazy

    meta = {'imported_from': 'parquet', 'filename': filename,}
    # Forcing the types prevents re-detection from the (already typed) values
    return create_table([header] + table_rows, meta=meta, force_types=types,
                        *args, **kwargs)
def import_from_csv(filename_or_fobj, encoding='utf-8', dialect=None,
                    *args, **kwargs):
    'Import data from a CSV file'
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    if dialect is None:
        # Sniff the dialect from the first line only
        sample = fobj.readline().decode(encoding)
        dialect = unicodecsv.Sniffer().sniff(sample)
        # NOTE(review): rewinds to the start of the file, which assumes the
        # fobj was at position 0 when passed in — confirm for pre-read
        # file objects.
        fobj.seek(0)

    kwargs['encoding'] = encoding
    csv_reader = unicodecsv.reader(fobj, encoding=encoding, dialect=dialect)

    meta = {'imported_from': 'csv', 'filename': filename,}
    return create_table(csv_reader, meta=meta, *args, **kwargs)
def import_from_xls(
    filename_or_fobj,
    sheet_name=None,
    sheet_index=0,
    start_row=None,
    start_column=None,
    end_row=None,
    end_column=None,
    *args,
    **kwargs
):
    """Return a rows.Table created from imported XLS file."""

    filename, _ = get_filename_and_fobj(filename_or_fobj, mode="rb")
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is not None:
        sheet = book.sheet_by_name(sheet_name)
    else:
        sheet = book.sheet_by_index(sheet_index)
    # TODO: may re-use Excel data types

    # Get header and rows
    # xlrd library reads rows and columns starting from 0 and ending on
    # sheet.nrows/ncols - 1. rows accepts the same pattern
    # The xlrd library reads rows and columns starting from 0 and ending on
    # sheet.nrows/ncols - 1. rows also uses 0-based indexes, so no
    # transformation is needed
    min_row, min_column = get_table_start(sheet)
    max_row, max_column = sheet.nrows - 1, sheet.ncols - 1
    # TODO: consider adding a parameter `ignore_padding=True` and when it's
    # True, consider `start_row` starting from `min_row` and `start_column`
    # starting from `min_col`.
    # Unset bounds default to the detected table region
    start_row = start_row if start_row is not None else min_row
    end_row = end_row if end_row is not None else max_row
    start_column = start_column if start_column is not None else min_column
    end_column = end_column if end_column is not None else max_column
    table_rows = [
        [
            cell_value(sheet, row_index, column_index)
            for column_index in range(start_column, end_column + 1)
        ]
        for row_index in range(start_row, end_row + 1)
    ]

    meta = {"imported_from": "xls", "filename": filename, "sheet_name": sheet.name}
    return create_table(table_rows, meta=meta, *args, **kwargs)
def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    '''Import a JSON file or file-like object into a `rows.Table`

    If a file-like object is provided it MUST be open in text (non-binary)
    mode on Python 3 and could be open in both binary or text mode on
    Python 2.
    '''
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    # `json.load(..., encoding=...)` is ignored on Python 3 and raises
    # TypeError from 3.9 on; decode manually when the stream yields bytes.
    contents = fobj.read()
    if isinstance(contents, bytes):
        contents = contents.decode(encoding)
    json_obj = json.loads(contents)

    field_names = list(json_obj[0].keys())
    table_rows = [[item[key] for key in field_names] for item in json_obj]
    meta = {'imported_from': 'json', 'filename': filename, 'encoding': encoding,}
    return create_table([field_names] + table_rows, meta=meta, *args, **kwargs)