def export_to_sqlite(
    table,
    filename_or_connection,
    table_name=None,
    table_name_format="table{index}",
    batch_size=100,
    callback=None,
    *args,
    **kwargs
):
    # TODO: should add transaction support?
    prepared_table = prepare_to_export(table, *args, **kwargs)
    connection = _get_connection(filename_or_connection)
    cursor = connection.cursor()
    if table_name is None:
        table_names = [item[0] for item in cursor.execute(SQL_TABLE_NAMES)]
        table_name = make_unique_name(
            table_name_format.format(index=1),
            existing_names=table_names,
            name_format=table_name_format,
            start=1,
        )
    elif not _valid_table_name(table_name):
        raise ValueError("Invalid table name: {}".format(table_name))

    field_names = next(prepared_table)
    field_types = list(map(table.fields.get, field_names))
    columns = [
        "{} {}".format(field_name, SQLITE_TYPES.get(field_type, DEFAULT_TYPE))
        for field_name, field_type in zip(field_names, field_types)
    ]
    cursor.execute(
        SQL_CREATE_TABLE.format(table_name=table_name, field_types=", ".join(columns))
    )

    insert_sql = SQL_INSERT.format(
        table_name=table_name,
        field_names=", ".join(field_names),
        placeholders=", ".join("?" for _ in field_names),
    )
    _convert_row = _python_to_sqlite(field_types)
    if callback is None:
        for batch in ipartition(prepared_table, batch_size):
            cursor.executemany(insert_sql, map(_convert_row, batch))
    else:
        total_written = 0
        for batch in ipartition(prepared_table, batch_size):
            cursor.executemany(insert_sql, map(_convert_row, batch))
            written = len(batch)
            total_written += written
            callback(written, total_written)

    connection.commit()
    return connection
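A hedged usage sketch of the exporter above; `rows.import_from_dicts` and the `report` callback are illustrative assumptions, not part of this listing:

import rows  # assumes the function above is exposed as rows.export_to_sqlite

# Hypothetical table built from dicts, just to exercise the exporter.
table = rows.import_from_dicts(
    [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}]
)

def report(written, total_written):
    # Called once per batch: rows written in this batch and the running total.
    print(f"wrote {written} rows ({total_written} total)")

connection = rows.export_to_sqlite(table, "people.sqlite", callback=report)
connection.close()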
def test_ipartition(self):
    iterable = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

    result = plugins_utils.ipartition(iterable, 3)
    self.assertEqual(type(result), types.GeneratorType)
    self.assertEqual(list(result), [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]])

    result = plugins_utils.ipartition(iterable, 2)
    self.assertEqual(type(result), types.GeneratorType)
    self.assertEqual(list(result), [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
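For reference, a minimal sketch of an `ipartition` implementation consistent with the test above (hypothetical; the real helper lives in `rows.plugins.utils`):

def ipartition(iterable, partition_size):
    """Lazily yield lists of at most `partition_size` items from `iterable`."""
    iterator = iter(iterable)
    finished = False
    while not finished:
        data = []
        for _ in range(partition_size):
            try:
                data.append(next(iterator))
            except StopIteration:
                finished = True
                break
        if data:
            yield data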
def export_to_csv(
    table,
    filename_or_fobj=None,
    encoding="utf-8",
    dialect=unicodecsv.excel,
    batch_size=100,
    callback=None,
    *args,
    **kwargs
):
    """Export a `rows.Table` to a CSV file.

    If a file-like object is provided it MUST be in binary mode, like in
    `open(filename, mode='wb')`.
    If no filename/fobj is provided, the function returns the CSV contents as
    a bytes string.
    """
    # TODO: will work only if table.fields is OrderedDict
    # TODO: should use fobj? What about creating a method like json.dumps?

    if filename_or_fobj is not None:
        _, fobj = get_filename_and_fobj(filename_or_fobj, mode="wb")
    else:
        fobj = BytesIO()

    # TODO: may use `io.BufferedWriter` instead of `ipartition` so the user can
    # choose the real size (in bytes) at which to flush to the file system,
    # instead of the number of rows
    writer = unicodecsv.writer(fobj, encoding=encoding, dialect=dialect)

    if callback is None:
        for batch in ipartition(serialize(table, *args, **kwargs), batch_size):
            writer.writerows(batch)
    else:
        serialized = serialize(table, *args, **kwargs)
        writer.writerow(next(serialized))  # First, write the header
        total = 0
        for batch in ipartition(serialized, batch_size):
            writer.writerows(batch)
            total += len(batch)
            callback(total)

    if filename_or_fobj is not None:
        fobj.flush()
        return fobj
    else:
        fobj.seek(0)
        result = fobj.read()
        fobj.close()
        return result
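A brief usage sketch of the two return paths above (the `table` object and the `progress` callback are assumed for illustration):

# No filename/fobj: the CSV contents come back as bytes from the BytesIO buffer.
csv_bytes = export_to_csv(table)

# With a filename and a callback that reports the running row total per batch.
def progress(total):
    print(f"{total} rows written")

export_to_csv(table, "output.csv", batch_size=500, callback=progress)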
def import_file(connection, filename, drop_sql, create_sql, batch_size=100):
    cursor = connection.cursor()
    cursor.execute(drop_sql)
    cursor.execute(create_sql)
    connection.commit()

    # Extract column names from the CREATE TABLE statement (assumes one
    # column definition per line between the parentheses).
    header = [
        line.strip().split()[0]
        for line in create_sql.split('(')[1].split(')')[0].splitlines()
        if line.strip()
    ]
    placeholders = ', '.join('?' for _ in header)
    header_names = ', '.join(header)
    table_name = create_sql.split('(')[0].strip().split()[-1]
    insert_sql = f'INSERT INTO {table_name} ({header_names}) VALUES ({placeholders})'

    with lzma.LZMAFile(filename, mode='rb', format=lzma.FORMAT_XZ) as fobj, \
            io.TextIOWrapper(fobj, encoding='utf-8') as fobj2:
        counter = 0
        for batch in ipartition(csv.DictReader(fobj2), batch_size):
            counter += len(batch)
            cursor.executemany(
                insert_sql,
                [[row[field] for field in header] for row in batch],
            )
            if counter % 100000 == 0:
                print(counter)
        if counter % 100000 != 0:
            print(counter)
    connection.commit()
def convert_file(filename, connection, tablename, fields, input_encoding):
    print(f'Converting {filename}...')
    drop_sql = f'DROP TABLE IF EXISTS {tablename}'
    fields_text = ', '.join(
        f'{field_name} {field_type}'
        for field_name, field_type in fields.items()
    )
    create_sql = f'CREATE TABLE IF NOT EXISTS {tablename} ({fields_text});'
    header = list(fields.keys())
    placeholders = ', '.join('?' for _ in header)
    header_names = ', '.join(header)
    insert_sql = f'INSERT INTO {tablename} ({header_names}) VALUES ({placeholders})'

    cursor = connection.cursor()
    cursor.execute(drop_sql)
    cursor.execute(create_sql)
    with lzma.open(filename) as fobj:
        fobj = io.TextIOWrapper(fobj, encoding=input_encoding)
        counter = 0
        batch_size = 100000
        for batch in ipartition(csv.DictReader(fobj), batch_size):
            cursor.executemany(
                insert_sql,
                [[row[field] for field in header] for row in batch],
            )
            counter += len(batch)
            if counter % 10000 == 0:
                print(f'  {counter}', end='\r')
        print(f'  {counter} - done.')
    connection.commit()
def handle(self, *args, **options):
    results = tuple(self.politicians_and_results())
    kwargs = {"desc": "Election results", "total": len(results), "unit": "results"}
    with tqdm(**kwargs) as progress_bar:
        for bulk in ipartition(results, 4096):
            bulk = tuple(self.serialize_bulk(bulk))
            bulk_update(bulk, update_fields=("election_history",))
            progress_bar.update(len(bulk))
def import_file(filename, Model, encoding='utf-8', batch_size=5000):
    reader = csv.DictReader(get_fobj(filename, encoding))
    counter = 0
    for batch in ipartition(reader, batch_size):
        Model.objects.bulk_create([Model(**data) for data in batch])
        counter += len(batch)
        print(counter)
    connection.commit()
def create_database(self, input_filename, encoding='utf-8'):
    connection = self.connection
    tablename = self.tablename
    fields = {
        'alternative_names': 'TEXT',
        'classification': 'TEXT',
        'first_name': 'TEXT',
        'frequency_female': 'INT',
        'frequency_male': 'INT',
        'frequency_total': 'INT',
        'frequency_group': 'INT',
        'group_name': 'TEXT',
        'ratio': 'FLOAT',
    }
    temptable = f'{tablename}_temp'
    field_types = ', '.join(f'{name} {type_}' for name, type_ in fields.items())
    sql_drop_table = 'DROP TABLE IF EXISTS {tablename}'
    sql_create_temptable = f'CREATE TABLE {temptable} ({field_types})'
    sql_create_index = '''
        CREATE INDEX idx_{tablename}_name_classification
        ON {tablename} (first_name, classification)
    '''
    sql_create_table = f'''
        CREATE TABLE {tablename} AS
            SELECT * FROM {temptable}
            GROUP BY first_name
            ORDER BY first_name
    '''

    cursor = connection.cursor()
    cursor.execute(sql_drop_table.format(tablename=temptable))
    cursor.execute(sql_drop_table.format(tablename=tablename))
    cursor.execute(sql_create_temptable)
    connection.commit()

    fobj = io.TextIOWrapper(
        lzma.open(input_filename, mode='r'),
        encoding=encoding,
    )
    progress = tqdm(csv.DictReader(fobj))
    for batch in ipartition(progress, self.batch_size):
        self._insert_names(
            temptable,
            [
                row['name']
                for row in batch
                if row['document_type'] == 'CPF'
            ],
        )
    cursor.execute(sql_create_index.format(tablename=temptable))
    connection.commit()

    cursor.execute(sql_create_table)
    cursor.execute(sql_create_index.format(tablename=tablename))
    cursor.execute(sql_drop_table.format(tablename=temptable))
    connection.commit()
    self._vacuum_db()
def post_handle(self):
    assets = tuple(self.assets_per_politician_per_year())
    kwargs = {
        "desc": f"Calculating {Asset._meta.verbose_name} per year/politician",
        "total": len(assets),
        "unit": "politician",
    }
    with tqdm(**kwargs) as progress_bar:
        for bulk in ipartition(assets, 4096):
            bulk = tuple(self.serialize_bulk(bulk))
            bulk_update(bulk, update_fields=["asset_history"])
            progress_bar.update(len(bulk))
def link_campaign(self, year):
    kwargs = {
        "desc": str(year),
        "total": Candidate.objects.campaign(year).exclude(voter_id=None).count(),
        "unit": "links",
    }
    with tqdm(**kwargs) as progress_bar:
        for bulk in ipartition(self.linked_candidates(year), 4096):
            bulk_update(bulk, update_fields=("politician",))
            progress_bar.update(len(bulk))
def export_to_postgresql(
    table,
    connection_or_uri,
    table_name=None,
    table_name_format="table{index}",
    batch_size=100,
    close_connection=False,
    *args,
    **kwargs
):
    # TODO: should add transaction support?

    if table_name is not None and not _valid_table_name(table_name):
        raise ValueError("Invalid table name: {}".format(table_name))

    prepared_table = prepare_to_export(table, *args, **kwargs)
    connection = _get_connection(connection_or_uri)
    cursor = connection.cursor()
    if table_name is None:
        cursor.execute(SQL_TABLE_NAMES)
        table_names = [item[0] for item in cursor.fetchall()]
        table_name = make_unique_name(
            table_name_format.format(index=1),
            existing_names=table_names,
            name_format=table_name_format,
            start=1,
        )

    field_names = next(prepared_table)
    field_types = list(map(table.fields.get, field_names))
    columns = [
        "{} {}".format(field_name, SQL_TYPES.get(field_type, DEFAULT_TYPE))
        for field_name, field_type in zip(field_names, field_types)
    ]
    cursor.execute(
        SQL_CREATE_TABLE.format(table_name=table_name, field_types=", ".join(columns))
    )

    insert_sql = SQL_INSERT.format(
        table_name=table_name,
        field_names=", ".join(field_names),
        placeholders=", ".join("%s" for _ in field_names),
    )
    _convert_row = _python_to_postgresql(field_types)
    for batch in ipartition(prepared_table, batch_size):
        cursor.executemany(insert_sql, map(_convert_row, batch))

    connection.commit()
    cursor.close()
    if close_connection:
        connection.close()
    return connection, table_name
def transform_cnae_secundaria(row):
    """Transform row of type CNAE"""
    cnaes = [
        "".join(digits)
        for digits in ipartition(row.pop("cnae"), 7)
        if set(digits) != set(["0"])
    ]
    data = []
    for cnae in cnaes:
        new_row = row.copy()
        new_row["cnae"] = cnae
        data.append(new_row)
    return data
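A hypothetical input/output pair for the transform above: a 21-character "cnae" string holding two 7-digit codes plus one zero-filled slot expands into one row per non-zero code (the "cnpj" key and the values are made up for illustration):

row = {"cnpj": "12345678000195", "cnae": "471130147121000000000"}
print(transform_cnae_secundaria(row))
# [{'cnpj': '12345678000195', 'cnae': '4711301'},
#  {'cnpj': '12345678000195', 'cnae': '4712100'}]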
def _export_csv(self, query, filename, encoding):
    cursor = self.connection.cursor()
    cursor.execute(query)
    header = [item[0] for item in cursor.description]

    binary_fobj = lzma.open(filename, mode='w')
    fobj = io.TextIOWrapper(binary_fobj, encoding=encoding)
    writer = csv.DictWriter(fobj, fieldnames=header)
    writer.writeheader()
    with tqdm() as progress:
        for batch in ipartition(cursor.fetchall(), self.batch_size):
            writer.writerows([dict(zip(header, row)) for row in batch])
            progress.n += len(batch)
            progress.refresh()
def handle(self, *args, **options):
    self.path = Path(options['csv'])
    if not self.path.exists():
        raise CommandError(f'{self.path} does not exist')

    with open(self.path) as fobj:
        reader = DictReader(fobj, fieldnames=self.headers)
        next(reader)  # skip header row
        data = (self.model(**line) for line in reader)
        for bulk in ipartition(data, self.bulk_size):
            self.model.objects.bulk_create(bulk)
            self.stats(len(bulk))

    print(self.message)
def classify_names(self, workers=16):
    connection = self.connection
    tablename = self.tablename
    query = f'''
        SELECT first_name
        FROM {tablename}
        WHERE classification = '' OR classification IS NULL
    '''
    update_sql = f'''
        UPDATE {tablename}
        SET
            alternative_names = ?,
            classification = ?,
            frequency_female = ?,
            frequency_male = ?,
            frequency_total = ?,
            ratio = ?
        WHERE first_name = ?
    '''

    with Pool(processes=workers) as pool, tqdm() as progress:
        cursor = connection.cursor()
        remaining = self.count_not_classified()
        batch_size = workers * 2
        while remaining:
            cursor.execute(query)
            header = [item[0] for item in cursor.description]
            progress.total = remaining
            for batch in ipartition(cursor.fetchall(), batch_size):
                names = [
                    dict(zip(header, row))['first_name']
                    for row in batch
                ]
                results = pool.map(download_name_stats, names)
                update_data = []
                for name, result in zip(names, results):
                    update_data.append(serialize_row(name, result))
                cursor.executemany(update_sql, update_data)
                connection.commit()
                progress.n += len(batch)
                progress.update()
            self.extract_alternatives()
            remaining = self.count_not_classified()
def __enter__(self):
    with self.open(self.csv_path, "rt") as input:
        reader = csv.reader(input)
        headers = self.headers or next(reader)
        total = self.total_slices
        desc = f"Slicing {self.csv_path} into smaller files"
        with tqdm(total=total, desc=desc, unit="slices") as progress_bar:
            slices = ipartition(reader, self.bulk_size)
            for count, lines in enumerate(slices):
                output_path = Path(self.tmp.name) / f"{count}{self.extension}"
                with self.open(output_path, "wt") as output:
                    writer = csv.writer(output)
                    writer.writerow(headers)
                    writer.writerows(lines)
                self.slices.append(output_path)
                progress_bar.update(1)
    return self
def extract_alternatives(self):
    connection = self.connection
    tablename = self.tablename
    query = f'''
        SELECT first_name, alternative_names
        FROM {tablename}
    '''
    cursor = connection.cursor()
    cursor.execute(query)
    header = [item[0] for item in cursor.description]
    data = [dict(zip(header, row)) for row in cursor.fetchall()]

    names, alternatives = set(), set()
    for row in data:
        names.add(row['first_name'])
        if row['alternative_names']:
            alternatives.update(row['alternative_names'].split('|'))
    new_names = correct_names(alternatives - names)

    for batch in ipartition(new_names, self.batch_size):
        self._insert_names(tablename, batch)
    connection.commit()