def import_from_html(filename_or_fobj, encoding='utf-8', index=0,
                     ignore_colspan=True, preserve_html=False, row_tag='tr',
                     column_tag='td|th', *args, **kwargs):
    """Import one ``<table>`` element of an HTML document.

    The document is read from ``filename_or_fobj``, decoded with
    ``encoding`` and parsed; the ``index``-th element matched by the XPath
    ``//table`` is converted into a list of rows (each a list of cell
    values) which is handed to ``create_table``.

    Args:
        filename_or_fobj: resolved through ``get_filename_and_fobj`` into a
            filename and a file object whose ``read()`` must return bytes.
        encoding: used to decode the document; also forwarded to
            ``create_table`` as the ``encoding`` keyword argument.
        index: position of the wanted table among all ``//table`` matches.
        ignore_colspan: when true, keep only the rows that have as many
            cells as the widest row (rows shortened by a ``colspan`` are
            dropped).
        preserve_html: when true, cell values come from ``_get_content``
            (presumably keeping the inner markup -- confirm against that
            helper); otherwise the stripped text content is used.
        row_tag: XPath expression, relative to the table, selecting rows.
        column_tag: XPath expression, relative to a row, selecting cells.
        *args, **kwargs: forwarded unchanged to ``create_table``.

    Returns:
        Whatever ``create_table`` returns for the extracted rows.

    Raises:
        IndexError: if ``index`` is outside the list of tables found.
        ValueError: if the selected table has no rows (``max()`` of an
            empty sequence).
    """
    # TODO: unescape before returning: html_parser.unescape(html)
    # TODO: lxml -> unicode?
    filename, fobj = get_filename_and_fobj(filename_or_fobj)
    kwargs['encoding'] = encoding
    html = fobj.read().decode(encoding)
    html_tree = document_fromstring(html)
    tables = html_tree.xpath('//table')
    table = tables[index]

    # Drop the <thead>/<tbody> wrappers so that `row_tag` is matched directly
    # below the table (assumes `strip_tags` keeps the children -- as lxml's
    # does; TODO confirm which `strip_tags` is imported).
    strip_tags(table, 'thead')
    strip_tags(table, 'tbody')
    row_elements = table.xpath(row_tag)

    # Pick the cell extractor once instead of duplicating the comprehension.
    if preserve_html:
        extract = _get_content
    else:
        def extract(value_element):
            return value_element.text_content().strip()

    table_rows = [[extract(value_element)
                   for value_element in row.xpath(column_tag)]
                  for row in row_elements]

    max_columns = max(len(row) for row in table_rows)
    if ignore_colspan:
        # A list comprehension (not `filter`) so the result is a real list
        # on Python 3 as well, where `filter` returns a one-shot iterator.
        table_rows = [row for row in table_rows if len(row) == max_columns]

    meta = {'imported_from': 'html', 'filename': filename}
    return create_table(table_rows, *args, meta=meta, **kwargs)
def test_create_table_skip_header(self):
    """``skip_header=True`` must drop the first data row and nothing else.

    The same three-row dataset is loaded twice with explicit field types,
    once with ``skip_header=True`` and once with ``skip_header=False``.
    Both tables must expose the same fields; the first one must hold the
    last two rows only, the second one all three.
    """
    data = [['1', 'Álvaro'], ['2', 'turicas'], ['3', 'Justen']]
    field_types = OrderedDict([
        ('integer', fields.IntegerField),
        ('string', fields.UnicodeField),
    ])

    skipped = create_table(data, fields=field_types, skip_header=True)
    kept = create_table(data, fields=field_types, skip_header=False)

    self.assertEqual(field_types, skipped.fields)
    self.assertEqual(skipped.fields, kept.fields)
    self.assertEqual(len(skipped), 2)
    self.assertEqual(len(kept), 3)

    first_row, second_row, third_row = (
        {'integer': 1, 'string': 'Álvaro'},
        {'integer': 2, 'string': 'turicas'},
        {'integer': 3, 'string': 'Justen'},
    )
    # (table, row position, expected converted row)
    expectations = (
        (skipped, 0, second_row),
        (kept, 0, first_row),
        (skipped, 1, third_row),
        (kept, 1, second_row),
        (kept, 2, third_row),
    )
    for table, position, expected in expectations:
        self.assertEqual(dict(table[position]._asdict()), expected)
def import_from_csv(filename_or_fobj, encoding='utf-8', delimiter=',',
                    quotechar='"', *args, **kwargs):
    """Import data from a CSV file.

    ``filename_or_fobj`` is resolved through ``get_filename_and_fobj``; the
    resulting file object is wrapped in a ``unicodecsv`` reader and that
    reader is passed as the row source to ``create_table``.

    Args:
        filename_or_fobj: the CSV source, as accepted by
            ``get_filename_and_fobj``.
        encoding: given to the CSV reader and also forwarded to
            ``create_table`` as the ``encoding`` keyword argument.
        delimiter: field separator (converted with ``str()`` before use).
        quotechar: quoting character (converted with ``str()`` before use).
        *args, **kwargs: forwarded unchanged to ``create_table``.

    Returns:
        Whatever ``create_table`` returns for the CSV rows.
    """
    filename, fobj = get_filename_and_fobj(filename_or_fobj)
    kwargs['encoding'] = encoding

    reader_options = {
        'encoding': encoding,
        'delimiter': str(delimiter),
        'quotechar': str(quotechar),
    }
    rows = unicodecsv.reader(fobj, **reader_options)

    return create_table(
        rows,
        *args,
        meta={'imported_from': 'csv', 'filename': filename},
        **kwargs
    )
def test_create_table_skip_header(self):
    """Compare ``create_table`` output with and without ``skip_header``.

    With ``skip_header=True`` the first of the three input rows must be
    left out of the table; with ``skip_header=False`` every row is kept.
    The field definitions must be the ones passed in, in both cases.
    """
    def as_dict(row):
        return dict(row._asdict())

    field_types = OrderedDict()
    field_types['integer'] = fields.IntegerField
    field_types['string'] = fields.UnicodeField
    data = [['1', 'Álvaro'], ['2', 'turicas'], ['3', 'Justen']]

    without_first = create_table(data, fields=field_types, skip_header=True)
    with_first = create_table(data, fields=field_types, skip_header=False)

    # Same schema, different number of rows.
    self.assertEqual(field_types, without_first.fields)
    self.assertEqual(without_first.fields, with_first.fields)
    self.assertEqual(len(without_first), 2)
    self.assertEqual(len(with_first), 3)

    row_a = {'integer': 1, 'string': 'Álvaro'}
    row_b = {'integer': 2, 'string': 'turicas'}
    row_c = {'integer': 3, 'string': 'Justen'}

    self.assertEqual(as_dict(without_first[0]), row_b)
    self.assertEqual(as_dict(with_first[0]), row_a)
    self.assertEqual(as_dict(without_first[1]), row_c)
    self.assertEqual(as_dict(with_first[1]), row_b)
    self.assertEqual(as_dict(with_first[2]), row_c)
def import_from_xls(filename_or_fobj, sheet_name=None, sheet_index=0,
                    start_row=0, start_column=0, *args, **kwargs):
    """Import a table from one sheet of an XLS workbook.

    The workbook is opened by filename with ``xlrd``. The header is read
    from row ``start_row``, beginning at ``start_column`` and extending
    right until the first falsy cell value. Data rows are read from the row
    after the header, using the same columns, until the first blank row or
    the end of the sheet.

    Args:
        filename_or_fobj: resolved through ``get_filename_and_fobj``; only
            the filename is used (the workbook is opened by name).
        sheet_name: name of the sheet to read; when ``None`` the sheet is
            selected by ``sheet_index`` instead.
        sheet_index: position of the sheet, used when ``sheet_name`` is
            ``None``.
        start_row: row index of the header.
        start_column: column index of the first field.
        *args, **kwargs: forwarded unchanged to ``create_table``.

    Returns:
        Whatever ``create_table`` returns for ``[header] + data rows``.
    """
    import numbers  # local import: only needed for the blank-cell test

    filename, _ = get_filename_and_fobj(filename_or_fobj)
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is not None:
        sheet = book.sheet_by_name(sheet_name)
    else:
        sheet = book.sheet_by_index(sheet_index)
    # TODO: may re-use Excel data types

    # Get field names
    # TODO: may use sheet.col_values or even sheet.ncols
    column_count = 0
    header = []
    column_value = cell_value(sheet, start_row, start_column + column_count)
    while column_value:
        header.append(column_value)
        column_count += 1
        column_value = cell_value(sheet, start_row,
                                  start_column + column_count)

    # Get sheet rows
    # TODO: may use sheet.col_slice
    table_rows = []
    # Bounding the scan by `sheet.nrows` guarantees termination without
    # relying on how `cell_value` reacts to reads past the end of the sheet.
    for row_index in range(start_row + 1, sheet.nrows):
        row = [cell_value(sheet, row_index, start_column + column_index)
               for column_index in range(column_count)]
        # A row is blank only when every cell is an empty non-numeric value
        # (None, ''...). Plain `not any(row)` would also treat a row made of
        # zeros/False as the end of the data and silently drop what follows.
        # NOTE(review): assumes `cell_value` yields None or an empty string
        # for empty cells -- confirm against that helper.
        row_is_blank = all(
            not value and not isinstance(value, numbers.Number)
            for value in row
        )
        if row_is_blank:
            break
        table_rows.append(row)

    meta = {'imported_from': 'xls', 'filename': filename}
    return create_table([header] + table_rows, *args, meta=meta, **kwargs)
def import_from_html(filename_or_fobj, encoding='utf-8', index=0,
                     ignore_colspan=True, preserve_html=False, row_tag='tr',
                     column_tag='td|th', *args, **kwargs):
    """Import one ``<table>`` element of an HTML document.

    The document is read from ``filename_or_fobj``, decoded with
    ``encoding`` and parsed; the ``index``-th element matched by the XPath
    ``//table`` is converted into a list of rows (each a list of cell
    values) which is handed to ``create_table``.

    Args:
        filename_or_fobj: resolved through ``get_filename_and_fobj`` into a
            filename and a file object whose ``read()`` must return bytes.
        encoding: used to decode the document; also forwarded to
            ``create_table`` as the ``encoding`` keyword argument.
        index: position of the wanted table among all ``//table`` matches.
        ignore_colspan: when true, keep only the rows that have as many
            cells as the widest row (rows shortened by a ``colspan`` are
            dropped).
        preserve_html: when true, cell values come from ``_get_content``
            (presumably keeping the inner markup -- confirm against that
            helper); otherwise the stripped text content is used.
        row_tag: XPath expression, relative to the table, selecting rows.
        column_tag: XPath expression, relative to a row, selecting cells.
        *args, **kwargs: forwarded unchanged to ``create_table``.

    Returns:
        Whatever ``create_table`` returns for the extracted rows.

    Raises:
        IndexError: if ``index`` is outside the list of tables found.
        ValueError: if the selected table has no rows (``max()`` of an
            empty sequence).
    """
    # TODO: unescape before returning: html_parser.unescape(html)
    # TODO: lxml -> unicode?
    filename, fobj = get_filename_and_fobj(filename_or_fobj)
    kwargs['encoding'] = encoding
    html = fobj.read().decode(encoding)
    html_tree = document_fromstring(html)
    tables = html_tree.xpath('//table')
    table = tables[index]

    # Drop the <thead>/<tbody> wrappers so that `row_tag` is matched directly
    # below the table (assumes `strip_tags` keeps the children -- as lxml's
    # does; TODO confirm which `strip_tags` is imported).
    strip_tags(table, 'thead')
    strip_tags(table, 'tbody')
    row_elements = table.xpath(row_tag)

    # Pick the cell extractor once instead of duplicating the comprehension.
    if preserve_html:
        extract = _get_content
    else:
        def extract(value_element):
            return value_element.text_content().strip()

    table_rows = [
        [extract(value_element) for value_element in row.xpath(column_tag)]
        for row in row_elements
    ]

    max_columns = max(len(row) for row in table_rows)
    if ignore_colspan:
        # A list comprehension (not `filter`) so the result is a real list
        # on Python 3 as well, where `filter` returns a one-shot iterator.
        table_rows = [row for row in table_rows if len(row) == max_columns]

    meta = {
        'imported_from': 'html',
        'filename': filename,
    }
    return create_table(table_rows, *args, meta=meta, **kwargs)
def import_from_xls(filename_or_fobj, sheet_name=None, sheet_index=0,
                    start_row=0, start_column=0, *args, **kwargs):
    """Import a table from one sheet of an XLS workbook.

    The workbook is opened by filename with ``xlrd``. The header is read
    from row ``start_row``, beginning at ``start_column`` and extending
    right until the first falsy cell value. Data rows are read from the row
    after the header, using the same columns, until the first blank row or
    the end of the sheet.

    Args:
        filename_or_fobj: resolved through ``get_filename_and_fobj``; only
            the filename is used (the workbook is opened by name).
        sheet_name: name of the sheet to read; when ``None`` the sheet is
            selected by ``sheet_index`` instead.
        sheet_index: position of the sheet, used when ``sheet_name`` is
            ``None``.
        start_row: row index of the header.
        start_column: column index of the first field.
        *args, **kwargs: forwarded unchanged to ``create_table``.

    Returns:
        Whatever ``create_table`` returns for ``[header] + data rows``.
    """
    import numbers  # local import: only needed for the blank-cell test

    filename, _ = get_filename_and_fobj(filename_or_fobj)
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is not None:
        sheet = book.sheet_by_name(sheet_name)
    else:
        sheet = book.sheet_by_index(sheet_index)
    # TODO: may re-use Excel data types

    # Get field names
    # TODO: may use sheet.col_values or even sheet.ncols
    column_count = 0
    header = []
    column_value = cell_value(sheet, start_row, start_column + column_count)
    while column_value:
        header.append(column_value)
        column_count += 1
        column_value = cell_value(sheet, start_row,
                                  start_column + column_count)

    # Get sheet rows
    # TODO: may use sheet.col_slice
    table_rows = []
    # Bounding the scan by `sheet.nrows` guarantees termination without
    # relying on how `cell_value` reacts to reads past the end of the sheet.
    for row_index in range(start_row + 1, sheet.nrows):
        row = [cell_value(sheet, row_index, start_column + column_index)
               for column_index in range(column_count)]
        # A row is blank only when every cell is an empty non-numeric value
        # (None, ''...). Plain `not any(row)` would also treat a row made of
        # zeros/False as the end of the data and silently drop what follows.
        # NOTE(review): assumes `cell_value` yields None or an empty string
        # for empty cells -- confirm against that helper.
        row_is_blank = all(
            not value and not isinstance(value, numbers.Number)
            for value in row
        )
        if row_is_blank:
            break
        table_rows.append(row)

    meta = {'imported_from': 'xls', 'filename': filename}
    return create_table([header] + table_rows, *args, meta=meta, **kwargs)