def test_normal_(self, table_name, headers, dup_col_handler, expected):
    """Duplicate-column handling should yield the expected normalized table."""
    sanitized = SQLiteTableDataSanitizer(
        TableData(table_name, headers, []),
        dup_col_handler=dup_col_handler,
    ).normalize()

    print_test_result(
        expected=dump_tabledata(expected), actual=dump_tabledata(sanitized)
    )

    assert sanitized.equals(expected)
def test_normal(self, table_name, headers, records, expected):
    """Sanitized table data should round-trip through an in-memory SQLite DB."""
    sanitized = SQLiteTableDataSanitizer(
        TableData(table_name, headers, records)
    ).normalize()

    print_test_result(
        expected=dump_tabledata(expected), actual=dump_tabledata(sanitized)
    )

    con = connect_memdb()
    con.create_table_from_tabledata(sanitized)

    assert con.select_as_tabledata(sanitized.table_name) == expected
    assert sanitized.equals(expected)
def test_normal_(self, table_name, headers, dup_col_handler, expected):
    """Duplicate-column handling test; dumps tables only when pytablewriter exists."""
    sanitized = SQLiteTableDataSanitizer(
        TableData(table_name, headers, []), dup_col_handler=dup_col_handler
    ).normalize()

    try:
        from pytablewriter import dumps_tabledata

        print_test_result(
            expected=dumps_tabledata(expected), actual=dumps_tabledata(sanitized)
        )
    except ImportError:
        # Dumping is diagnostic only; the assertion below is the real check.
        pass

    assert sanitized.equals(expected)
def normalize_table(self, table_data: TableData, dup_col_handler=None) -> TableData:
    """Normalize *table_data* for SQLite, optionally rewriting header symbols.

    Args:
        table_data: Source table to sanitize (not modified).
        dup_col_handler: Strategy for duplicated columns; defaults to
            ``DEFAULT_DUP_COL_HANDLER`` when ``None``.

    Returns:
        A new, sanitized ``TableData``. When ``self._symbol_replace_value``
        is set, unprintable characters and symbols in headers are replaced.
    """
    from pathvalidate import replace_symbol, replace_unprintable_char
    from simplesqlite import SQLiteTableDataSanitizer

    if dup_col_handler is None:
        dup_col_handler = DEFAULT_DUP_COL_HANDLER

    normalized = SQLiteTableDataSanitizer(
        table_data,
        dup_col_handler=dup_col_handler,
        is_type_inference=self._is_type_inference,
        max_workers=self._max_workers,
    ).normalize()

    if self._symbol_replace_value is None:
        return normalized

    # Only headers are rewritten; rows and extractor state carry over as-is.
    replaced_headers = [
        replace_symbol(
            replace_unprintable_char(header),
            self._symbol_replace_value,
            is_replace_consecutive_chars=True,
            is_strip=True,
        )
        for header in normalized.headers
    ]

    return TableData(
        normalized.table_name,
        replaced_headers,
        normalized.rows,
        dp_extractor=normalized.dp_extractor,
        type_hints=table_data.dp_extractor.column_type_hints,
    )
def normalize_table(self, table_data, dup_col_handler=None):
    """Normalize *table_data* for SQLite, optionally rewriting header symbols.

    Legacy variant using the ``header_list``/``row_list`` TableData API.
    Returns a new ``TableData``; the input is not modified.
    """
    from tabledata import TableData
    from pathvalidate import replace_symbol, replace_unprintable_char
    from simplesqlite import SQLiteTableDataSanitizer

    if dup_col_handler is None:
        dup_col_handler = DEFAULT_DUP_COL_HANDLER

    normalized = SQLiteTableDataSanitizer(
        table_data, dup_col_handler=dup_col_handler
    ).normalize()

    if self._symbol_replace_value is None:
        return normalized

    # Only headers are rewritten; rows and extractor state carry over as-is.
    replaced_headers = [
        replace_symbol(
            replace_unprintable_char(header),
            self._symbol_replace_value,
            is_replace_consecutive_chars=True,
            is_strip=True,
        )
        for header in normalized.header_list
    ]

    return TableData(
        table_name=normalized.table_name,
        header_list=replaced_headers,
        row_list=normalized.row_list,
        dp_extractor=normalized.dp_extractor,
    )
def __convert(self, table_data):
    """Sanitize *table_data*, create its SQLite table, and record the name."""
    self.__logger.debug("loaded tabledata: {}".format(six.text_type(table_data)))

    sanitized = SQLiteTableDataSanitizer(table_data).normalize()

    self.__table_creator.create(
        sanitized, self.__index_list, source_info=self.__source_info
    )
    self.__converted_table_name_set.add(sanitized.table_name)
def test_normal(self, table_name, headers, records, expected):
    """Sanitized data should round-trip through an in-memory SQLite DB."""
    sanitized = SQLiteTableDataSanitizer(
        TableData(table_name, headers, records)
    ).normalize()

    try:
        from pytablewriter import dumps_tabledata

        print_test_result(
            expected=dumps_tabledata(expected), actual=dumps_tabledata(sanitized)
        )
    except ImportError:
        # Dumping is diagnostic only; the assertions below are the real check.
        pass

    con = connect_memdb()
    con.create_table_from_tabledata(sanitized)

    assert con.select_as_tabledata(sanitized.table_name) == expected
    assert sanitized.equals(expected)
def gs(ctx, credentials, title, output_path):
    """
    Convert a spreadsheet in Google Sheets to a SQLite database file.

    CREDENTIALS: OAuth2 Google credentials file.

    TITLE: Title of the Google Sheets to convert.
    """
    con = create_database(ctx, output_path)
    verbosity_level = ctx.obj.get(Context.VERBOSITY_LEVEL)
    schema_extractor = get_schema_extractor(con, verbosity_level)
    result_counter = ResultCounter()
    logger = make_logger("{:s} gs".format(PROGRAM_NAME), ctx.obj[Context.LOG_LEVEL])
    table_creator = TableCreator(logger=logger, dst_con=con)

    loader = ptr.GoogleSheetsTableLoader()
    loader.source = credentials
    loader.title = title

    try:
        for table_data in loader.load():
            logger.debug(u"loaded table_data: {}".format(six.text_type(table_data)))

            sqlite_tabledata = SQLiteTableDataSanitizer(table_data).normalize()

            try:
                table_creator.create(sqlite_tabledata, ctx.obj.get(Context.INDEX_LIST))
                # BUG FIX: successful conversions were never counted, skewing
                # the completion summary and the process return code.
                result_counter.inc_success()
            except (ptr.ValidationError, ptr.DataError):
                result_counter.inc_fail()
                # BUG FIX: previously fell through and logged a success
                # message for a table that failed to convert (the sibling
                # `url`/`file` handlers `continue` here).
                continue

            logger.info(
                get_success_message(
                    verbosity_level,
                    "google sheets",
                    schema_extractor.get_table_schema_text(
                        sqlite_tabledata.table_name)))
    except ptr.OpenError as e:
        logger.error(msgfy.to_error_message(e))
        result_counter.inc_fail()
    except AttributeError:
        # gspread raises AttributeError when the credentials JSON is malformed.
        logger.error(u"invalid credentials data: path={}".format(credentials))
        result_counter.inc_fail()
    except (ptr.ValidationError, ptr.DataError) as e:
        logger.error(u"invalid credentials data: path={}, message={}".format(
            credentials, str(e)))
        result_counter.inc_fail()

    write_completion_message(logger, output_path, result_counter)
    sys.exit(result_counter.get_return_code())
def __write(self, table_data):
    """Normalize *table_data*, create its table, and log a success summary."""
    from simplesqlite import SQLiteTableDataSanitizer

    self.__logger.debug(u"loaded tabledata: {}".format(six.text_type(table_data)))

    sanitized = SQLiteTableDataSanitizer(table_data).normalize()
    self.__table_creator.create(sanitized, self.__index_list)
    self.__result_counter.inc_success()

    self.__logger.info(
        get_success_message(
            self.__verbosity_level,
            self.__source,
            self.__schema_extractor.get_table_schema_text(sanitized.table_name),
        )
    )
def __convert(self, table_data: TableData) -> None:
    """Sanitize *table_data*, create its SQLite table, and record the name."""
    self.__logger.debug("loaded tabledata: {}".format(str(table_data)))

    sanitized = SQLiteTableDataSanitizer(
        table_data, max_workers=self.__max_workers
    ).normalize()

    self.__table_creator.create(
        sanitized,
        self.__index_list,
        source_info=self.__source_info,
    )

    # table_name is typed Optional upstream; cast keeps the set homogeneous.
    self.__converted_table_name_set.add(cast(str, sanitized.table_name))
def test_normal_type_hints(
    self, table_name, headers, records, type_hints, expecte_col_types, expecte_data
):
    """Type hints passed to TableData should drive the sanitized column types."""
    sanitized = SQLiteTableDataSanitizer(
        TableData(table_name, headers, records, type_hints=type_hints)
    ).normalize()

    assert [
        col_dp.typename for col_dp in sanitized.column_dp_list
    ] == expecte_col_types

    con = connect_memdb()
    con.create_table_from_tabledata(sanitized)

    assert con.select_as_tabledata(sanitized.table_name) == expecte_data
def test_smoke(self, tmpdir, filename):
    """Load each fixture file and confirm at least one table converts cleanly."""
    try:
        import pytablereader as ptr
    except ImportError:
        pytest.skip("requires pytablereader")

    con = SimpleSQLite(str(tmpdir.join("tmp.db")), "w")
    test_data_file_path = os.path.join(os.path.dirname(__file__), "data", filename)
    success_count = 0

    for table_data in ptr.TableFileLoader(test_data_file_path).load():
        if table_data.is_empty():
            continue

        try:
            from pytablewriter import dumps_tabledata

            print(dumps_tabledata(table_data))
        except ImportError:
            # Dump is diagnostic only; conversion below is the real check.
            pass

        try:
            con.create_table_from_tabledata(
                SQLiteTableDataSanitizer(table_data).normalize()
            )
            success_count += 1
        except ValueError as e:
            # Some fixture tables are expected to be unconvertible.
            print(e)

    con.commit()

    assert success_count > 0
def url(ctx, url, format_name, output_path, encoding, proxy):
    """
    Scrape tabular data from a URL and convert data to a SQLite database file.
    """
    from ._ipynb_converter import is_ipynb_url, load_ipynb_url, convert_nb

    if typepy.is_empty_sequence(url):
        sys.exit(ExitCode.NO_INPUT)

    con = create_database(ctx, output_path)
    verbosity_level = ctx.obj.get(Context.VERBOSITY_LEVEL)
    schema_extractor = get_schema_extractor(con, verbosity_level)
    result_counter = ResultCounter()
    logger = make_logger("{:s} url".format(PROGRAM_NAME), ctx.obj[Context.LOG_LEVEL])

    # Fall back to configured defaults when encoding/proxy are not given.
    if typepy.is_empty_sequence(encoding):
        encoding = app_config_manager.load().get(ConfigKey.DEFAULT_ENCODING)
        logger.debug("use default encoding: {}".format(encoding))

    if typepy.is_null_string(proxy):
        proxy = app_config_manager.load().get(ConfigKey.PROXY_SERVER)

    proxies = {
        "http": proxy,
        "https": proxy,
    }

    # Jupyter notebooks take a dedicated conversion path and exit early.
    if format_name in IPYNB_FORMAT_NAME_LIST or is_ipynb_url(url):
        convert_nb(logger, con, result_counter,
                   nb=load_ipynb_url(url, proxies=proxies))

        for table_name in con.get_table_name_list():
            logger.info(
                get_success_message(
                    verbosity_level, get_logging_url_path(url),
                    schema_extractor.get_table_schema_text(table_name)))
            result_counter.inc_success()

        if result_counter.total_count == 0:
            # NOTE(review): formatted message is discarded — presumably this
            # was meant to be logger.warn(...) as done at the end of this
            # function; confirm before changing.
            table_not_found_msg_format.format(url)
        else:
            write_completion_message(logger, output_path, result_counter)

        sys.exit(result_counter.get_return_code())

    # Pick a loader for the requested format; retry as HTML before giving up.
    try:
        loader = create_url_loader(logger, url, format_name, encoding, proxies)
    except ptr.LoaderNotFoundError as e:
        logger.debug(e)

        try:
            loader = create_url_loader(logger, url, "html", encoding, proxies)
        except ptr.LoaderNotFoundError as e:
            logger.error(msgfy.to_error_message(e))
            sys.exit(ExitCode.FAILED_LOADER_NOT_FOUND)

    table_creator = TableCreator(logger=logger, dst_con=con)

    try:
        for table_data in loader.load():
            logger.debug(u"loaded table_data: {}".format(
                six.text_type(table_data)))

            sqlite_tabledata = SQLiteTableDataSanitizer(table_data).normalize()

            try:
                table_creator.create(sqlite_tabledata,
                                     ctx.obj.get(Context.INDEX_LIST))
                result_counter.inc_success()
            except simplesqlite.OperationalError as e:
                # NOTE(review): e.message is Python-2-only; str(e) is used in
                # the other handlers here — confirm intended runtime.
                logger.error(
                    u"{:s}: failed to convert: url={}, message={}".format(
                        e.__class__.__name__, url, e.message))
                result_counter.inc_fail()
                continue
            except ValueError as e:
                logger.debug(u"{:s}: url={}, message={}".format(
                    e.__class__.__name__, url, str(e)))
                result_counter.inc_fail()
                continue

            logger.info(
                get_success_message(
                    verbosity_level, get_logging_url_path(url),
                    schema_extractor.get_table_schema_text(
                        sqlite_tabledata.table_name)))
    except ptr.ValidationError as e:
        # JSON that fails table validation may still be convertible via the
        # dict-based fallback converter.
        is_fail = True

        if loader.format_name == "json":
            dict_converter = DictConverter(logger, table_creator,
                                           result_counter, schema_extractor,
                                           verbosity_level, source=url,
                                           index_list=ctx.obj.get(
                                               Context.INDEX_LIST))

            try:
                dict_converter.to_sqlite_table(loader.loader.load_dict(), [])
            except AttributeError:
                # Loader variant without load_dict(); treat as a failure below.
                pass
            else:
                is_fail = False

        if is_fail:
            logger.error(u"{:s}: url={}, message={}".format(
                e.__class__.__name__, url, str(e)))
            result_counter.inc_fail()
    except ptr.DataError as e:
        logger.error(u"{:s}: invalid data: url={}, message={}".format(
            e.__class__.__name__, url, str(e)))
        result_counter.inc_fail()

    if result_counter.total_count == 0:
        logger.warn(table_not_found_msg_format.format(url))

    write_completion_message(logger, output_path, result_counter)
    sys.exit(result_counter.get_return_code())
def file(ctx, files, format_name, output_path, encoding):
    """
    Convert tabular data within
    CSV/Excel/HTML/JSON/Jupyter Notebook/LTSV/Markdown/Mediawiki/SQLite/SSV/TSV
    file(s) to a SQLite database file.
    """
    from ._ipynb_converter import is_ipynb_file_path, load_ipynb_file, convert_nb

    if typepy.is_empty_sequence(files):
        sys.exit(ExitCode.NO_INPUT)

    con = create_database(ctx, output_path)
    verbosity_level = ctx.obj.get(Context.VERBOSITY_LEVEL)
    schema_extractor = get_schema_extractor(con, verbosity_level)
    result_counter = ResultCounter()
    logger = make_logger("{:s} file".format(PROGRAM_NAME),
                         ctx.obj[Context.LOG_LEVEL])
    table_creator = TableCreator(logger=logger, dst_con=con)

    for file_path in files:
        file_path = path.Path(file_path)
        if not file_path.isfile():
            logger.error(u"file not found: {}".format(file_path))
            result_counter.inc_fail()
            continue

        # Never read the database we are writing to as an input file.
        if file_path == output_path:
            logger.warn(
                u"skip a file which has the same path as the output file ({})".
                format(file_path))
            continue

        logger.debug(u"converting '{}'".format(file_path))
        # Snapshot the counter so we can tell whether this file produced
        # any tables at all.
        convert_count = result_counter.total_count

        # Jupyter notebooks take a dedicated conversion path.
        if format_name in IPYNB_FORMAT_NAME_LIST or is_ipynb_file_path(
                file_path):
            convert_nb(logger, con, result_counter,
                       nb=load_ipynb_file(file_path, encoding=encoding))

            for table_name in con.get_table_name_list():
                logger.info(
                    get_success_message(
                        verbosity_level, file_path,
                        schema_extractor.get_table_schema_text(table_name)))
                result_counter.inc_success()

            if result_counter.total_count == convert_count:
                # NOTE(review): formatted message is discarded — presumably
                # this was meant to be logger.warn(...) as at the end of the
                # loop body; confirm before changing.
                table_not_found_msg_format.format(file_path)

            continue

        try:
            loader = ptr.TableFileLoader(file_path, format_name=format_name,
                                         encoding=encoding)
        except ptr.InvalidFilePathError as e:
            logger.debug(msgfy.to_debug_message(e))
            result_counter.inc_fail()
            continue
        except ptr.LoaderNotFoundError:
            logger.debug(
                u"loader not found that coincide with '{}'".format(file_path))
            result_counter.inc_fail()
            continue

        try:
            for table_data in loader.load():
                logger.debug(u"loaded tabledata: {}".format(
                    six.text_type(table_data)))

                sqlite_tabledata = SQLiteTableDataSanitizer(
                    table_data).normalize()

                try:
                    table_creator.create(sqlite_tabledata,
                                         ctx.obj.get(Context.INDEX_LIST))
                    result_counter.inc_success()
                except (ValueError, IOError) as e:
                    logger.debug(u"exception={:s}, path={}, message={}".format(
                        type(e).__name__, file_path, e))
                    result_counter.inc_fail()
                    continue

                logger.info(
                    get_success_message(
                        verbosity_level, file_path,
                        schema_extractor.get_table_schema_text(
                            sqlite_tabledata.table_name)))
        except ptr.OpenError as e:
            logger.error(u"{:s}: open error: file={}, message='{}'".format(
                e.__class__.__name__, file_path, str(e)))
            result_counter.inc_fail()
        except ptr.ValidationError as e:
            # JSON that fails table validation may still be convertible via
            # the dict-based fallback converter.
            if loader.format_name == "json":
                dict_converter = DictConverter(logger, table_creator,
                                               result_counter,
                                               schema_extractor,
                                               verbosity_level,
                                               source=file_path,
                                               index_list=ctx.obj.get(
                                                   Context.INDEX_LIST))

                try:
                    dict_converter.to_sqlite_table(loader.loader.load_dict(),
                                                   [])
                except AttributeError:
                    # Loader variant without load_dict(); fall through to the
                    # error report below.
                    pass
                else:
                    continue

            logger.error(
                u"{:s}: invalid {} data format: path={}, message={}".format(
                    e.__class__.__name__,
                    _get_format_type_from_path(file_path), file_path, str(e)))
            result_counter.inc_fail()
        except ptr.DataError as e:
            logger.error(u"{:s}: invalid {} data: path={}, message={}".format(
                e.__class__.__name__, _get_format_type_from_path(file_path),
                file_path, str(e)))
            result_counter.inc_fail()

        # Counter unchanged means this file yielded no tables.
        if result_counter.total_count == convert_count:
            logger.warn(table_not_found_msg_format.format(file_path))

    write_completion_message(logger, output_path, result_counter)
    sys.exit(result_counter.get_return_code())
def test_exception(self, table_name, headers, expected):
    """dup_col_handler='error' must raise for the given invalid headers."""
    with pytest.raises(expected):
        tabledata = TableData(table_name, headers, [])
        SQLiteTableDataSanitizer(tabledata, dup_col_handler="error").normalize()
def test_exception_invalid_data(self, table_name, headers, records, expected):
    """normalize() must raise the expected error for invalid table data."""
    with pytest.raises(expected):
        tabledata = TableData(table_name, headers, records)
        SQLiteTableDataSanitizer(tabledata).normalize()