def test_normal_json(self, url, format_name):
    responses.add(
        responses.GET,
        url,
        body=dedent(
            """\
            [
                {"attr_a": 1},
                {"attr_b": 2.1, "attr_c": "bb"}
            ]"""
        ),
        content_type="text/plain; charset=utf-8",
        status=200,
    )

    expected_list = [
        TableData(
            "url_loader",
            ["attr_a", "attr_b", "attr_c"],
            [{"attr_a": 1}, {"attr_b": 2.1, "attr_c": "bb"}],
        )
    ]

    loader = ptr.TableUrlLoader(url, format_name)
    assert loader.format_name == "json"

    loader.table_name = "url_loader"

    for table_data in loader.load():
        print("{} {}".format(table_data, dumps_tabledata(table_data)))
        print(table_data.rows)
        print("[expected]")
        for expected in expected_list:
            print(dumps_tabledata(expected))

        assert table_data.in_tabledata_list(expected_list)
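The url and format_name arguments above are presumably injected by pytest parametrization rather than defined in the snippet itself. A minimal sketch of what such a decorator could look like; the URLs and value pairs are illustrative placeholders, not taken from the original test suite:

import pytest

# Hypothetical parametrization: concrete URLs and format names are placeholders.
@pytest.mark.parametrize(
    ["url", "format_name"],
    [
        ["https://example.com/data.json", "json"],
        ["https://example.com/data", "json"],
    ],
)
def test_normal_json(url, format_name):
    ...  # test body as in the example above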
def test_normal_csv(self, url, format_name):
    responses.add(
        responses.GET,
        url,
        body='''"attr_a","attr_b","attr_c"
1,4,"a"
2,2.1,"bb"
3,120.9,"ccc"''',
        content_type="text/plain; charset=utf-8",
        status=200,
    )

    expected_list = [
        ptr.TableData(
            "csv1",
            ["attr_a", "attr_b", "attr_c"],
            [
                [1, 4, "a"],
                [2, "2.1", "bb"],
                [3, "120.9", "ccc"],
            ],
        )
    ]

    loader = ptr.TableUrlLoader(url, format_name)
    assert loader.format_name == "csv"

    for tabledata, expected in zip(loader.load(), expected_list):
        print("[expected]\n{}".format(ptw.dump_tabledata(expected)))
        print("[actual]\n{}".format(ptw.dump_tabledata(tabledata)))
        assert tabledata == expected
def test_normal_json(self, url, format_name):
    responses.add(
        responses.GET,
        url,
        body=dedent(
            """\
            [
                {"attr_a": 1},
                {"attr_b": 2.1, "attr_c": "bb"}
            ]"""
        ),
        content_type="text/plain; charset=utf-8",
        status=200,
    )

    expected_list = [
        TableData(
            "json1",
            ["attr_a", "attr_b", "attr_c"],
            [
                {"attr_a": 1},
                {"attr_b": 2.1, "attr_c": "bb"},
            ],
        ),
    ]

    loader = ptr.TableUrlLoader(url, format_name)
    assert loader.format_name == "json"

    for table_data in loader.load():
        assert table_data.in_tabledata_list(expected_list)
def test_normal_excel(self): url = "https://github.com/thombashi/valid/test/data/validdata.xlsx" data_path = os.path.join(os.path.dirname(__file__), "data/validdata.xlsx") with open(data_path, "rb") as f: responses.add( responses.GET, url, body=f.read(), content_type="application/octet-stream", status=200, ) expeced_list = [ TableData( "testsheet1", ["a1", "b1", "c1"], [["aa1", "ab1", "ac1"], [1.0, 1.1, "a"], [2.0, 2.2, "bb"], [3.0, 3.3, "cc"]], ), TableData( "testsheet3", ["a3", "b3", "c3"], [["aa3", "ab3", "ac3"], [4.0, 1.1, "a"], [5.0, "", "bb"], [6.0, 3.3, ""]], ), ] loader = ptr.TableUrlLoader(url) assert loader.format_name == "excel" for table_data in loader.load(): assert table_data.in_tabledata_list(expeced_list)
def test_normal_json(self, url, format_name):
    responses.add(
        responses.GET,
        url,
        body="""[
            {"attr_a": 1},
            {"attr_b": 2.1, "attr_c": "bb"}
        ]""",
        content_type="text/plain; charset=utf-8",
        status=200,
    )

    expected_list = [
        ptr.TableData(
            "json1",
            ["attr_a", "attr_b", "attr_c"],
            [
                {"attr_a": 1},
                {"attr_b": 2.1, "attr_c": "bb"},
            ],
        )
    ]

    loader = ptr.TableUrlLoader(url, format_name)
    assert loader.format_name == "json"

    for tabledata, expected in zip(loader.load(), expected_list):
        assert tabledata == expected
def test_normal_csv(self, url, format_name):
    responses.add(
        responses.GET,
        url,
        body=dedent(
            """\
            "attr_a","attr_b","attr_c"
            1,4,"a"
            2,2.1,"bb"
            3,120.9,"ccc"
            """
        ),
        content_type="text/plain; charset=utf-8",
        status=200,
    )

    expected_list = [
        TableData(
            "csv1",
            ["attr_a", "attr_b", "attr_c"],
            [[1, 4, "a"], [2, "2.1", "bb"], [3, "120.9", "ccc"]],
        )
    ]

    loader = ptr.TableUrlLoader(url, format_name)
    assert loader.format_name == "csv"

    for table_data in loader.load():
        assert table_data.in_tabledata_list(expected_list)
def test_exception(self, value, format_name, expected):
    responses.add(responses.GET, value, body="404: Not Found", status=404)

    with pytest.raises(expected):
        ptr.TableUrlLoader(value, format_name)
def create_url_loader(logger, source_url, format_name, encoding, proxies):
    try:
        return ptr.TableUrlLoader(source_url, format_name, encoding=encoding, proxies=proxies)
    except (ptr.HTTPError, ptr.UrlError) as e:
        logger.error(msgfy.to_error_message(e))
        sys.exit(ExitCode.FAILED_HTTP)
    except ptr.ProxyError as e:
        logger.error(msgfy.to_error_message(e))
        sys.exit(errno.ECONNABORTED)
def test_normal(self, value, format_name, expected):
    responses.add(
        responses.GET,
        value,
        body="{}",
        content_type="text/plain; charset=utf-8",
        status=200,
    )

    loader = ptr.TableUrlLoader(value, format_name)
    expected_loader = expected("")

    assert loader.source_type == expected_loader.source_type
    assert loader.format_name == expected_loader.format_name
def create_url_loader(logger, url, format_name, encoding, proxies):
    try:
        return ptr.TableUrlLoader(url, format_name, encoding=encoding, proxies=proxies)
    except ptr.HTTPError as e:
        logger.error(e)
        sys.exit(ExitCode.FAILED_HTTP)
    except ptr.ProxyError as e:
        logger.error(e)
        sys.exit(errno.ECONNABORTED)
def create_url_loader(logger, source_url, format_name, encoding, proxies):
    try:
        return ptr.TableUrlLoader(source_url, format_name, encoding=encoding, proxies=proxies)
    except ptr.HTTPError as e:
        logger.error("{:s}: {}".format(e.__class__.__name__, e))
        sys.exit(ExitCode.FAILED_HTTP)
    except ptr.ProxyError as e:
        logger.error("{:s}: {}".format(e.__class__.__name__, e))
        sys.exit(errno.ECONNABORTED)
def create_url_loader(
    logger,
    source_url: str,
    format_name: str,
    encoding: str,
    type_hint_rules: Optional[TypeHintRules],
    proxies: Optional[Dict],
) -> AbstractTableReader:
    try:
        return ptr.TableUrlLoader(
            source_url,
            format_name,
            encoding=encoding,
            type_hint_rules=type_hint_rules,
            proxies=proxies,
        )
    except (ptr.HTTPError, ptr.UrlError) as e:
        logger.error(msgfy.to_error_message(e))
        sys.exit(ExitCode.FAILED_HTTP)
    except ptr.ProxyError as e:
        logger.error(msgfy.to_error_message(e))
        sys.exit(errno.ECONNABORTED)
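A minimal usage sketch for the helper above; the logger, URL, and argument values are placeholders for illustration, not taken from the original source:

import logging

logger = logging.getLogger("example")  # placeholder logger

# Hypothetical invocation of create_url_loader as defined above.
loader = create_url_loader(
    logger,
    source_url="https://example.com/data.csv",  # placeholder URL
    format_name="csv",
    encoding="utf-8",
    type_hint_rules=None,
    proxies=None,
)

for table_data in loader.load():
    print(table_data)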
def test_normal_excel(self):
    url = "https://github.com/thombashi/valid/test/data/validdata.xlsx"
    data_path = os.path.join(os.path.dirname(__file__), "data/validdata.xlsx")

    with open(data_path, "rb") as f:
        responses.add(
            responses.GET,
            url,
            body=f.read(),
            content_type="application/octet-stream",
            status=200,
        )

    expected_list = [
        ptr.TableData(
            table_name="testsheet1",
            header_list=["a1", "b1", "c1"],
            record_list=[
                ["aa1", "ab1", "ac1"],
                [1.0, 1.1, "a"],
                [2.0, 2.2, "bb"],
                [3.0, 3.3, "cc"],
            ],
        ),
        ptr.TableData(
            table_name="testsheet3",
            header_list=["a3", "b3", "c3"],
            record_list=[
                ["aa3", "ab3", "ac3"],
                [4.0, 1.1, "a"],
                [5.0, "", "bb"],
                [6.0, 3.3, ""],
            ],
        ),
    ]

    loader = ptr.TableUrlLoader(url)
    assert loader.format_name == "excel"

    for tabledata, expected in zip(loader.load(), expected_list):
        assert tabledata == expected
def test_normal_sqlite(self):
    url = "https://github.com/thombashi/valid/test/data/valid.sqlite3"
    data_path = os.path.join(dirname(dirname(__file__)), "data/valid.sqlite3")

    with open(data_path, "rb") as f:
        responses.add(
            responses.GET,
            url,
            body=f.read(),
            content_type="application/octet-stream",
            status=200,
        )

    loader = ptr.TableUrlLoader(url)
    assert loader.format_name == "sqlite"

    for table_data in loader.load():
        assert table_data == TableData(
            "tblfaker",
            ["file_extension", "random_number"],
            [["webm", 679215], ["jpg", 5088743], ["avi", 8268]],
        )
#!/usr/bin/env python3

import pytablewriter as ptw
import pytablereader as ptr


# Load every table found in the HTML page at the URL.
loader = ptr.TableUrlLoader(
    "https://en.wikipedia.org/wiki/List_of_unit_testing_frameworks", "html"
)

# Write the extracted tables to a reStructuredText file,
# reusing the encoding detected by the loader.
writer = ptw.TableWriterFactory.create_from_format_name("rst")
writer.stream = open("load_url_result.rst", "w", encoding=loader.encoding)

for table_data in loader.load():
    writer.from_tabledata(table_data)
    writer.write_table()
def url(ctx, url, format_name, output_path, encoding, proxy):
    """
    Fetch data from a URL and convert data to a SQLite database file.
    """

    if dataproperty.is_empty_sequence(url):
        sys.exit(ExitCode.NO_INPUT)

    con = create_database(ctx, output_path)
    verbosity_level = ctx.obj.get(Context.VERBOSITY_LEVEL)
    extractor = get_schema_extractor(con, verbosity_level)
    result_counter = ResultCounter()

    logger = logbook.Logger("sqlitebiter url")
    _setup_logger_from_context(logger, ctx.obj[Context.LOG_LEVEL])

    proxies = {}
    if dataproperty.is_not_empty_string(proxy):
        proxies = {
            "http": proxy,
            "https": proxy,
        }

    try:
        loader = ptr.TableUrlLoader(url, format_name, encoding=encoding, proxies=proxies)
    except ptr.LoaderNotFoundError as e:
        try:
            # fall back to the HTML loader when no loader matches the format
            loader = ptr.TableUrlLoader(url, "html", encoding=encoding, proxies=proxies)
        except (ptr.LoaderNotFoundError, ptr.HTTPError):
            logger.error(e)
            sys.exit(ExitCode.FAILED_LOADER_NOT_FOUND)
    except ptr.HTTPError as e:
        logger.error(e)
        sys.exit(ExitCode.FAILED_HTTP)

    try:
        for tabledata in loader.load():
            sqlite_tabledata = ptr.SQLiteTableDataSanitizer(tabledata).sanitize()

            try:
                con.create_table_from_tabledata(sqlite_tabledata)
                result_counter.inc_success()
            except ValueError as e:
                logger.debug("url={}, message={}".format(url, str(e)))
                result_counter.inc_fail()
                continue

            log_message = get_success_log_format(verbosity_level).format(
                url,
                extractor.get_table_schema_text(sqlite_tabledata.table_name).strip(),
            )
            logger.info(log_message)
    except ptr.InvalidDataError as e:
        logger.error("invalid data: url={}, message={}".format(url, str(e)))
        result_counter.inc_fail()

    write_completion_message(logger, output_path, result_counter)

    sys.exit(result_counter.get_return_code())
    if show_flags:
        for country in sorted(data['total_country']):
            total_country_flags.append(get_country_flag_emoji(country))

    print(
        "Involved in "
        + str(number_of_events)
        + " events in "
        + str(len(data['total_country']))
        + " countries."
    )
    print(" ".join(total_country_flags))


if __name__ == "__main__":
    latest_year = 2019
    stats_pr_year = defaultdict(list)
    stats_total = defaultdict(list)
    current_year = latest_year

    loader = ptr.TableUrlLoader("http://localhost:4000/talks/", "html")
    writer = ptw.TableWriterFactory.create_from_format_name("md")

    for table_data in loader.load():
        country_city_pr_year = []
        country_pr_year = []
        city_pr_year = []

        for record in table_data.row_list:
            country_city_pr_year.append(record[1])
            parsed = record[1].split(",")
            country_pr_year.append(parsed[0])
            city_pr_year.append(parsed[1])

        writer.from_tabledata(table_data)