def download_organizations():
    """Download organizations JSON and extract its properties"""
    response = requests.get(URL)
    data = response.json()
    organizations = [organization["properties"] for organization in data["features"]]
    return rows.import_from_dicts(organizations)
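# A minimal usage sketch for the function above (assumed, not from the
# original source): `URL` is a placeholder for a GeoJSON-style endpoint
# whose `features` entries each carry a `properties` dict.
import requests
import rows

URL = "https://example.com/organizations.geojson"  # hypothetical endpoint

table = download_organizations()
rows.export_to_csv(table, "organizations.csv")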
def test_import_from_dicts_return_desired_data(self):
    table = rows.import_from_dicts(self.data)

    self.assertEqual(len(table), 3)
    self.assertEqual(len(table.fields), 4)
    self.assertEqual(
        set(table.field_names), set(["ids", "name", "number", "other"])
    )
    self.assertEqual(table.fields["name"], rows.fields.TextField)
    self.assertEqual(table.fields["ids"], rows.fields.TextField)
    self.assertEqual(table.fields["number"], rows.fields.IntegerField)
    self.assertEqual(table.fields["other"], rows.fields.FloatField)

    self.assertEqual(table[0].name, "Álvaro")
    self.assertEqual(table[0].ids, "123")
    self.assertEqual(table[0].number, 3)
    self.assertEqual(table[0].other, None)
    self.assertEqual(table[1].name, "Test")
    self.assertEqual(table[1].ids, "456")
    self.assertEqual(table[1].number, None)
    self.assertEqual(table[1].other, None)
    self.assertEqual(table[2].name, "Python")
    self.assertEqual(table[2].ids, "123, 456")
    self.assertEqual(table[2].number, None)
    self.assertEqual(table[2].other, 3.14)
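# The fixture these assertions imply, reconstructed from the expected
# values (the real `self.data` lives in the suite's setup, so treat this
# as an illustrative sketch): mixed types per column drive the field
# detection checked above ("ids" stays text because of "123, 456").
data = [
    {"name": "Álvaro", "ids": "123", "number": 3, "other": None},
    {"name": "Test", "ids": "456", "number": None, "other": None},
    {"name": "Python", "ids": "123, 456", "number": None, "other": 3.14},
]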
def download_organizations():
    'Download organizations JSON and extract its properties'
    page = urllib.request.urlopen(URL)  # `urllib.urlopen` was Python 2 only
    data = json.loads(page.read())
    organizations = [organization['properties'] for organization in data['features']]
    return rows.import_from_dicts(organizations)
def test_export_callback(self):
    table = rows.import_from_dicts([{"id": number} for number in range(10)])
    myfunc = mock.Mock()
    rows.export_to_sqlite(table, ":memory:", callback=myfunc, batch_size=3)
    self.assertEqual(myfunc.call_count, 4)
    self.assertEqual(
        [(x[0][0], x[0][1]) for x in myfunc.call_args_list],
        [(3, 3), (3, 6), (3, 9), (1, 10)],
    )
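# The assertions above pin down the callback contract for batched SQLite
# exports: the callback is invoked once per batch with
# (rows_written_in_this_batch, total_rows_written). A simple progress
# logger matching that shape might look like this (sketch, not from the
# original source):
def log_progress(batch_written, total_written):
    print(f"wrote {batch_written} rows ({total_written} so far)")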
def test_import_field_limit(self):
    temp = tempfile.NamedTemporaryFile(delete=False)
    filename = "{}.{}".format(temp.name, self.file_extension)
    self.files_to_delete.append(filename)

    table = rows.import_from_dicts([{"f1": "a" * 132000}])
    rows.export_to_csv(table, filename)

    # The following line must not raise the exception:
    # `_csv.Error: field larger than field limit (131072)`
    new = rows.import_from_csv(filename)
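# For reference: 131072 (2 ** 17) is the Python `csv` module's default
# field size limit, which the 132000-character value above deliberately
# exceeds. Outside of `rows`, reading such a file with the stdlib directly
# would require raising the limit by hand (sketch; the chosen limit is an
# arbitrary example):
import csv

old_limit = csv.field_size_limit(512 * 1024)  # returns the previous limit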
def test_export_to_dicts(self):
    table = rows.import_from_dicts(self.data)
    result = rows.export_to_dicts(table)
    full_data = [
        {"name": "Álvaro", "ids": "123", "number": 3, "other": None},
        {"name": "Test", "ids": "456", "number": None, "other": None},
        {"name": "Python", "ids": "123, 456", "number": None, "other": 3.14},
    ]

    self.assertEqual(len(result), len(table))
    for expected, actual in zip(full_data, result):
        self.assertDictEqual(expected, actual)
def test_import_from_dicts_accepts_generator(self):
    max_size = 1000
    samples = 200
    generator = utils.LazyDictGenerator(max_size)
    datagen = iter(generator)
    table = rows.import_from_dicts(datagen, lazy=True, samples=samples)
    # `create_table` will consume the whole generator
    self.assertEqual(generator.last, max_size - 1)

    data = list(table)
    self.assertEqual(len(data), max_size)
    self.assertEqual(generator.last, max_size - 1)
def test_import_from_dicts_maintains_header_order(self):
    headers = list(string.ascii_lowercase)
    random.shuffle(headers)

    data = [
        OrderedDict([(header, 1) for header in headers]),
        OrderedDict([(header, 2) for header in headers]),
        OrderedDict([(header, 3) for header in headers]),
        OrderedDict([(header, 4) for header in headers]),
        OrderedDict([(header, 5) for header in headers]),
    ]
    table = rows.import_from_dicts(data)

    self.assertEqual(table.field_names, headers)
def test_import_from_dicts_uses_create_table(self, mocked_create_table):
    mocked_create_table.return_value = 42
    kwargs = {'some_key': 123, 'other': 456}

    result = rows.import_from_dicts(self.data, **kwargs)
    self.assertTrue(mocked_create_table.called)
    self.assertEqual(mocked_create_table.call_count, 1)
    self.assertEqual(result, 42)

    call = mocked_create_table.call_args
    kwargs['meta'] = {'imported_from': 'dicts'}
    self.assertEqual(call[1], kwargs)
def execute(self, year, action):
    logging.info(f"[Budget-CE] Starting for {year} and action {action}")
    self.select_year(year)
    self.select_month("Dezembro")  # December has the cumulative for the year
    self.select_action(action)
    self.select_modality_91("TUDO")
    self.select_report("Outros", "PA")
    filename = self.do_search()
    result = self.parse_budget(filename, year, action)
    for row in rows.import_from_dicts(result):
        yield row._asdict()
def extract_boletim(state, data):
    table = rows.import_from_dicts(
        data,
        force_types={
            "date": rows.fields.DateField,
            "notes": rows.fields.TextField,
            "state": rows.fields.TextField,
            "url": rows.fields.TextField,
        },
    )
    for row in table:
        row = row._asdict()
        yield row
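# A hedged usage sketch for the generator above. The keys match the
# `force_types` mapping; the ISO date string is what `DateField` is
# expected to parse. The values and URL are made up for illustration.
sample = [
    {
        "date": "2020-03-20",
        "notes": "",
        "state": "CE",
        "url": "https://example.com/boletim",  # hypothetical URL
    }
]
for record in extract_boletim("CE", sample):
    print(record)  # plain dict with typed values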
def test_import_from_dicts_uses_create_table(self, mocked_create_table):
    mocked_create_table.return_value = 42
    kwargs = {"some_key": 123, "other": 456}

    result = rows.import_from_dicts(self.data, **kwargs)
    self.assertTrue(mocked_create_table.called)
    self.assertEqual(mocked_create_table.call_count, 1)
    self.assertEqual(result, 42)

    call = mocked_create_table.call_args
    kwargs["meta"] = {"imported_from": "dicts"}
    kwargs["samples"] = None
    self.assertEqual(call[1], kwargs)
def csv_to_sqlite(
    input_filename,
    output_filename,
    samples=None,
    dialect=None,
    batch_size=10000,
    encoding="utf-8",
    callback=None,
    force_types=None,
    chunk_size=8388608,
    table_name="table1",
    schema=None,
):
    "Export a CSV file to SQLite, based on field type detection from samples"
    # TODO: automatically detect encoding if encoding == `None`
    # TODO: should be able to specify fields

    if dialect is None:  # Get a sample to detect dialect
        fobj = open_compressed(input_filename, mode="rb")
        sample = fobj.read(chunk_size)
        dialect = rows.plugins.csv.discover_dialect(sample, encoding=encoding)
    elif isinstance(dialect, six.text_type):
        dialect = csv.get_dialect(dialect)

    if schema is None:  # Identify data types
        fobj = open_compressed(input_filename, encoding=encoding)
        data = list(islice(csv.DictReader(fobj, dialect=dialect), samples))
        schema = rows.import_from_dicts(data).fields
        if force_types is not None:
            schema.update(force_types)

    # Create lazy table object to be converted
    # TODO: this laziness feature will be incorporated into the library soon,
    # so we can call `rows.import_from_csv` here instead of `csv.reader`.
    reader = csv.reader(
        open_compressed(input_filename, encoding=encoding), dialect=dialect
    )
    header = make_header(next(reader))  # skip header
    table = rows.Table(fields=OrderedDict([(field, schema[field]) for field in header]))
    table._rows = reader

    # Export to SQLite
    return rows.export_to_sqlite(
        table,
        output_filename,
        table_name=table_name,
        batch_size=batch_size,
        callback=callback,
    )
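# Example invocation of the `csv_to_sqlite` defined above (filenames are
# placeholders). Passing `samples=None` would make the type detector read
# every row; a bounded sample keeps detection fast on large files:
csv_to_sqlite(
    "data/input.csv.gz",
    "data/output.sqlite",
    table_name="mytable",
    samples=10000,
    force_types={"id": rows.fields.IntegerField},
)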
def pdf_to_csv(input_filename, output_filename):
    total_pages = rows.plugins.pdf.number_of_pages(input_filename)
    pdf = rows.plugins.pdf.PyMuPDFBackend(input_filename)
    result = []
    for page_number in range(1, total_pages + 1):
        page = list(next(pdf.objects(page_numbers=(page_number,))))
        data = list(rows.plugins.utils.ipartition(page, 4))
        header = [obj.text for obj in data[0]]
        for row in data[1:]:
            row = dict(zip(header, [obj.text for obj in row]))
            row["codigo_ibge"] = row.pop("IBGE")
            row["perfil"] = row.pop("Perfil Município")
            result.append(row)
    table = rows.import_from_dicts(result)
    rows.export_to_csv(table, output_filename)
def parse_file(filename):
    """Parse Amazonas' PDF file containing state employee information"""
    total_pages = rows.plugins.pdf.number_of_pages(filename)
    result = []
    for page in range(1, total_pages + 1):
        table = rows.import_from_pdf(
            filename,
            page_numbers=(page,),
            starts_after="NOME",
            fields=PDF_FIELD_TYPES,
            skip_header=True,
        )
        for row in table:
            result.append(convert_row(row))
    return rows.import_from_dicts(result)
def convert(state, input_filename, output_filename):
    table = rows.import_from_csv(
        input_filename,
        force_types={
            "confirmed": rows.fields.IntegerField,
            "deaths": rows.fields.IntegerField,
        },
    )
    state_cities = ["TOTAL NO ESTADO", "Importados/Indefinidos"] + sorted(
        row.municipio for row in cities if row.uf == state
    )
    confirmed, deaths, dates = {}, {}, []
    for row in table:
        row_confirmed = row.confirmed or 0
        row_date = row.date
        row_deaths = row.deaths or 0
        row_name = row.city if row.place_type == "city" else "TOTAL NO ESTADO"
        if row_name not in state_cities:
            print(f"ERRO: município {repr(row_name)} não encontrado.")
            continue
        if row_confirmed == 0 and row_deaths == 0:
            # No data for this city on this day
            continue
        if row_date not in confirmed:
            confirmed[row_date] = {}
        if row_date not in deaths:
            deaths[row_date] = {}
        if row_name in confirmed[row_date] or row_name in deaths[row_date]:
            print(f"ERRO: conflito em {repr(row_name)} para {row_date}.")
            continue
        confirmed[row_date][row_name] = row_confirmed
        deaths[row_date][row_name] = row_deaths

    result = []
    dates = sorted(confirmed.keys(), reverse=True)
    for city in state_cities:
        row = {"municipio": city}
        for date in dates:
            date_str = f"{date.day:02d}_{date.month:02d}"
            row[f"confirmados_{date_str}"] = confirmed[date].get(city, None)
            row[f"mortes_{date_str}"] = deaths[date].get(city, None)
        result.append(row)
    rows.export_to_csv(rows.import_from_dicts(result), output_filename)
def parse_boletim(self, state, data):
    self.logger.info(f"Parsing {state} boletim")
    try:
        reports = rows.import_from_dicts(
            data,
            force_types={
                "date": rows.fields.DateField,
                "notes": rows.fields.TextField,
                "state": rows.fields.TextField,
                "url": rows.fields.TextField,
            },
        )
    except Exception as exp:
        self.errors[state].append(
            ("boletim", state, f"{exp.__class__.__name__}: {exp}")
        )
        return
    for report in reports:
        report = report._asdict()
        self.logger.debug(report)
        self.boletim_writer.writerow(report)
def main():
    csv_filename = pathlib.Path('data/names.csv')
    output = pathlib.Path('output/names-stats.csv')
    if not csv_filename.parent.exists():
        os.makedirs(str(csv_filename.parent.absolute()), exist_ok=True)
    if not output.parent.exists():
        os.makedirs(str(output.parent.absolute()), exist_ok=True)
    requests_cache.install_cache('nomes-ibge')

    result = []
    for name in unique_names(csv_filename):
        print(name)
        row = download_name_stats(name)
        if row is None:
            continue
        row['alternative_names'] = '|'.join(row['alternative_names'])
        result.append(row)

    table = rows.import_from_dicts(result)
    table.order_by('name')
    rows.utils.export_to_uri(table, str(output.absolute()))
def csv2sqlite(
    input_filename,
    output_filename,
    table_name,
    samples=30000,
    batch_size=10000,
    encoding="utf-8",
    callback=None,
    force_types=None,
):
    # Identify data types
    fobj = open_compressed(input_filename, encoding)
    reader = csv.reader(fobj)
    header = next(reader)
    data = []
    for index, row in enumerate(reader):
        row = dict(zip(header, row))
        if index == samples:
            break
        data.append(row)
    fields = rows.import_from_dicts(data, import_fields=header).fields
    if force_types is not None:
        fields.update(force_types)

    # Create lazy table object to be converted
    table = rows.Table(fields=fields)
    reader = csv.reader(open_compressed(input_filename, encoding))
    next(reader)  # skip header
    table._rows = reader

    # Export to SQLite
    rows.export_to_sqlite(
        table,
        output_filename,
        table_name=table_name,
        callback=callback,
        batch_size=batch_size,
    )
def parse_state_file(self, response):
    state = response.meta["state"]
    if response.status >= 400:
        self.errors[state].append(
            ("connection", state, f"HTTP status code: {response.status}")
        )
    else:
        response_data = json.load(io.BytesIO(response.body))
        try:
            self.parse_boletim(state, response_data["reports"])
        except Exception as exp:
            self.errors[state].append(
                ("boletim", state, f"{exp.__class__.__name__}: {exp}")
            )
        try:
            self.parse_caso(state, response_data["cases"])
        except Exception as exp:
            self.errors[state].append(
                ("caso", state, f"{exp.__class__.__name__}: {exp}")
            )

    if self.errors[state]:
        error_counter = Counter(error[0] for error in self.errors[state])
        error_counter_str = ", ".join(
            f"{error_type}: {count}"
            for error_type, count in error_counter.items()
        )
        self.logger.error(
            f"{len(self.errors[state])} errors found when parsing {state} ({error_counter_str})"
        )
        error_header = ("sheet", "state", "message")
        errors = rows.import_from_dicts(
            [dict(zip(error_header, row)) for row in self.errors[state]]
        )
        filename = ERROR_PATH / f"errors-{state}.csv"
        if not filename.parent.exists():
            filename.parent.mkdir(parents=True)
        rows.export_to_csv(errors, filename)
def to_csv(self, path: Path) -> Path:
    data = (
        {"date": key, "value": self.data[key]} for key in sorted(self.data.keys())
    )
    export_to_csv(import_from_dicts(data), path)
    return path
def pgimport(
    filename,
    database_uri,
    table_name,
    encoding="utf-8",
    dialect=None,
    create_table=True,
    schema=None,
    callback=None,
    timeout=0.1,
    chunk_size=8388608,
    max_samples=10000,
):
    """Import data from CSV into PostgreSQL using the fastest method

    Required: psql command
    """
    fobj = open_compressed(filename, mode="r", encoding=encoding)
    sample = fobj.read(chunk_size)

    if dialect is None:  # Detect dialect
        dialect = rows.plugins.csv.discover_dialect(
            sample.encode(encoding), encoding=encoding
        )
    elif isinstance(dialect, six.text_type):
        dialect = csv.get_dialect(dialect)

    if schema is None:  # Detect field names
        reader = csv.reader(io.StringIO(sample), dialect=dialect)
        field_names = [slug(field_name) for field_name in next(reader)]
    else:
        field_names = list(schema.keys())

    if create_table:
        if schema is None:
            data = [
                dict(zip(field_names, row))
                for row in itertools.islice(reader, max_samples)
            ]
            table = rows.import_from_dicts(data)
            field_types = [table.fields[field_name] for field_name in field_names]
        else:
            field_types = list(schema.values())
        columns = [
            "{} {}".format(name, POSTGRESQL_TYPES.get(type_, DEFAULT_POSTGRESQL_TYPE))
            for name, type_ in zip(field_names, field_types)
        ]
        create_table = SQL_CREATE_TABLE.format(
            table_name=table_name, field_types=", ".join(columns)
        )
        execute_command(get_psql_command(create_table, database_uri=database_uri))

    # Prepare the `psql` command to be executed based on collected metadata
    command = get_psql_copy_command(
        database_uri=database_uri,
        dialect=dialect,
        direction="FROM",
        encoding=encoding,
        header=field_names,
        table_name=table_name,
    )
    rows_imported, error = 0, None
    fobj = open_compressed(filename, mode="rb")
    try:
        process = subprocess.Popen(
            shlex.split(command),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        data = fobj.read(chunk_size)
        total_written = 0
        while data != b"":
            written = process.stdin.write(data)
            total_written += written
            if callback:
                callback(written, total_written)
            data = fobj.read(chunk_size)
        stdout, stderr = process.communicate()
        if stderr != b"":
            raise RuntimeError(stderr.decode("utf-8"))
        rows_imported = int(stdout.replace(b"COPY ", b"").strip())

    except FileNotFoundError:
        raise RuntimeError("Command `psql` not found")

    except BrokenPipeError:
        raise RuntimeError(process.stderr.read().decode("utf-8"))

    return {"bytes_written": total_written, "rows_imported": rows_imported}
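# Example invocation of the `pgimport` defined above (the URI and filename
# are placeholders). The callback mirrors the (written, total_written)
# shape used inside the copy loop:
result = pgimport(
    "data.csv.gz",
    "postgres://user:password@localhost/dbname",  # hypothetical URI
    "mytable",
    callback=lambda written, total: print(f"{total} bytes sent"),
)
print(result["rows_imported"], "rows imported")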
import argparse
import os

import rows
import rows.utils

from ba_parse_pdf import extract_table as ba_extract_table
from sc_parse_pdf import extract_table as sc_extract_table

extract_table_functions = {"BA": ba_extract_table, "SC": sc_extract_table}

parser = argparse.ArgumentParser()
parser.add_argument("state", choices=["BA", "SC"])
parser.add_argument("input_uri")
parser.add_argument("output_filename")
args = parser.parse_args()

input_uri, delete = args.input_uri, False
if input_uri.lower().startswith("http://") or input_uri.lower().startswith("https://"):
    source = rows.utils.download_file(input_uri, progress=True, detect=False)
    input_uri, delete = source.uri, True

data = extract_table_functions[args.state](input_uri)
table = rows.import_from_dicts(data)
rows.export_to_csv(table, args.output_filename)
if delete:
    os.unlink(input_uri)
if 'db.sqlite3' not in os.listdir(settings.BASE_DIR):
    os.system("""
        rm -rf db.sqlite3 &&
        touch db.sqlite3 &&
        python manage.py makemigrations
        python manage.py migrate
    """)

arquivo = open('db.json', mode='r').read()
data = json.loads(arquivo)

for (k, v) in data.items():
    print('\n', k)
    if type(v).__name__ == 'list':
        data = rows.import_from_dicts(v)
        for row in data:
            if k == 'calendario_temporada':
                try:
                    pais = Pais.objects.get(pais=row.localizacao['pais'])
                except Pais.DoesNotExist:
                    pais = Pais.objects.create(
                        pais=row.localizacao['pais']
                    )
                try:
                    cidade = Cidade.objects.get(cidade=row.localizacao['cidade'])
                except Cidade.DoesNotExist:
                    cidade = Cidade.objects.create(
                        cidade=row.localizacao['cidade'],
def test_extract_table_3():
    expected = rows.import_from_csv(DATA_PATH / "expected_3.csv")
    result = rows.import_from_dicts(extract_table(DATA_PATH / "example_3.pdf"))
    assert_equal(result, expected)
def test_extract_table_2():
    expected = []
    result = rows.import_from_dicts(
        extract_table(DATA_PATH / "example_2.pdf")[: len(expected)]
    )
    assert_equal(result, expected)
import os

import rows

properties_of_interest = []
for result in os.listdir("results"):
    results = rows.import_from_csv("results/" + result)
    properties = [
        p._asdict()
        for p in results
        if p.for_rent
        and p.property_type == "casa"
        and p.rent_price <= 1500
        and "campinas" in p.city.lower()
    ]
    properties_of_interest.extend(properties)

table = rows.import_from_dicts(properties_of_interest)
rows.export_to_csv(table, "properties_of_interest_2.csv")
def convert_names(names, csv_output):
    table = rows.import_from_dicts(
        [name_ibge(name) for name in names if name_ibge(name)]
    )
    rows.export_to_csv(table, csv_output)
def export(path: Path = DEFAULT_EXPORT_FILE) -> None:
    """Export all data to CSV."""
    table = import_from_dicts(data())
    export_to_csv(table, path)
def test_export_callback(self):
    table = rows.import_from_dicts([{"id": number} for number in range(10)])
    myfunc = mock.Mock()
    rows.export_to_csv(table, callback=myfunc, batch_size=3)
    self.assertEqual(myfunc.call_count, 4)
    self.assertEqual([x[0][0] for x in myfunc.call_args_list], [3, 6, 9, 10])
def pgimport(filename, database_uri, table_name, encoding='utf-8',
             create_table=True, progress=False, timeout=0.1,
             chunk_size=8388608, max_samples=10000):
    """Import data from CSV into PostgreSQL using the fastest method

    Required: psql command
    """
    # Extract a sample from the CSV to detect its dialect and header
    fobj = open_compressed(filename, mode='r', encoding=encoding)
    sample = fobj.read(chunk_size).encode(encoding)
    dialect = rows.plugins.csv.discover_dialect(sample, encoding=encoding)
    reader = csv.reader(io.StringIO(sample.decode(encoding)))
    field_names = [slug(field_name) for field_name in next(reader)]

    if create_table:
        data = [
            dict(zip(field_names, row))
            for row in itertools.islice(reader, max_samples)
        ]
        table = rows.import_from_dicts(data)
        field_types = [table.fields[field_name] for field_name in field_names]
        columns = [
            '{} {}'.format(name, POSTGRESQL_TYPES.get(type_, DEFAULT_POSTGRESQL_TYPE))
            for name, type_ in zip(field_names, field_types)
        ]
        create_table = SQL_CREATE_TABLE.format(
            table_name=table_name,
            field_types=', '.join(columns),
        )
        execute_command(
            get_psql_command(create_table, database_uri=database_uri))

    # Prepare the `psql` command to be executed based on collected metadata
    command = get_psql_copy_command(
        database_uri=database_uri,
        table_name=table_name,
        header=field_names,
        dialect=dialect,
        encoding=encoding,
    )
    rows_imported, error, total_size = 0, None, None
    try:
        total_size = uncompressed_size(filename)
    except (RuntimeError, ValueError):
        pass

    if progress:
        progress_bar = tqdm(
            desc='Importing data',
            unit='bytes',
            unit_scale=True,
            unit_divisor=1024,
            total=total_size,
        )
    fobj = open_compressed(filename, mode='rb')
    try:
        process = subprocess.Popen(
            shlex.split(command),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        data = fobj.read(chunk_size)
        while data != b'':
            data_written = process.stdin.write(data)
            if progress:
                progress_bar.update(data_written)
            data = fobj.read(chunk_size)
        stdout, stderr = process.communicate()
        if stderr != b'':
            raise RuntimeError(stderr.decode('utf-8'))
        rows_imported = int(stdout.replace(b'COPY ', b'').strip())

    except FileNotFoundError:
        raise RuntimeError('Command `psql` not found')

    except BrokenPipeError:
        raise RuntimeError(process.stderr.read().decode('utf-8'))

    if progress:
        progress_bar.close()

    return rows_imported
def to_csv(self, path: Path) -> Path:
    """Export the adapter's data to a CSV file."""
    table = import_from_dicts(self.export())
    export_to_csv(table, path)
    return path
def process_item(self, item, spider):
    row = import_from_dicts([dict(item)])
    logger.debug(row)
    export_to_sqlite(row, self.conn, self.table)
    return item
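# The method above assumes a Scrapy item pipeline that owns a SQLite
# connection (`self.conn`) and a table name (`self.table`). A minimal
# surrounding class might look like this (sketch; the attribute names
# follow the snippet, the database path and table name are assumptions):
import sqlite3

class SQLitePipeline:
    table = "items"  # assumed table name

    def open_spider(self, spider):
        self.conn = sqlite3.connect("items.sqlite")  # hypothetical path

    def close_spider(self, spider):
        self.conn.close()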