Example #1
def export_to_sqlite(
    table,
    filename_or_connection,
    table_name=None,
    table_name_format="table{index}",
    batch_size=100,
    callback=None,
    *args,
    **kwargs
):
    # TODO: should add transaction support?
    prepared_table = prepare_to_export(table, *args, **kwargs)
    connection = _get_connection(filename_or_connection)
    cursor = connection.cursor()

    if table_name is None:
        table_names = [item[0] for item in cursor.execute(SQL_TABLE_NAMES)]
        table_name = make_unique_name(
            table_name_format.format(index=1),
            existing_names=table_names,
            name_format=table_name_format,
            start=1,
        )

    elif not _valid_table_name(table_name):
        raise ValueError("Invalid table name: {}".format(table_name))

    field_names = next(prepared_table)
    field_types = list(map(table.fields.get, field_names))
    columns = [
        "{} {}".format(field_name, SQLITE_TYPES.get(field_type, DEFAULT_TYPE))
        for field_name, field_type in zip(field_names, field_types)
    ]
    cursor.execute(
        SQL_CREATE_TABLE.format(table_name=table_name, field_types=", ".join(columns))
    )

    insert_sql = SQL_INSERT.format(
        table_name=table_name,
        field_names=", ".join(field_names),
        placeholders=", ".join("?" for _ in field_names),
    )
    _convert_row = _python_to_sqlite(field_types)

    if callback is None:
        for batch in ipartition(prepared_table, batch_size):
            cursor.executemany(insert_sql, map(_convert_row, batch))

    else:
        total_written = 0
        for batch in ipartition(prepared_table, batch_size):
            cursor.executemany(insert_sql, map(_convert_row, batch))
            written = len(batch)
            total_written += written
            callback(written, total_written)

    connection.commit()
    return connection
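A hypothetical call to the exporter above, with a progress callback, could look like the sketch below. `my_table` is an assumed `rows.Table` instance, and the callback receives the rows written in the current batch plus the running total, exactly as the loop above passes them:

def report_progress(written, total_written):
    print("wrote {} rows ({} so far)".format(written, total_written))

# Hypothetical usage sketch (not from the source): export to a new SQLite
# file, reporting progress every 500 rows.
connection = export_to_sqlite(
    my_table,                  # assumed rows.Table instance
    "output.sqlite",           # filename or an already-open sqlite3 connection
    table_name="people",       # must pass _valid_table_name()
    batch_size=500,
    callback=report_progress,
)
connection.close()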
Example #2
    def test_ipartition(self):
        iterable = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        result = plugins_utils.ipartition(iterable, 3)
        self.assertEqual(type(result), types.GeneratorType)
        self.assertEqual(list(result), [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]])

        result = plugins_utils.ipartition(iterable, 2)
        self.assertEqual(type(result), types.GeneratorType)
        self.assertEqual(list(result), [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
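The test pins down `ipartition`'s contract: it returns a generator that lazily yields lists of at most `partition_size` items, with a shorter final chunk for the leftovers. A minimal sketch of a compatible implementation (not necessarily the library's actual code) is:

from itertools import islice

def ipartition(iterable, partition_size):
    # Lazily yield lists of up to `partition_size` items from `iterable`.
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, partition_size))
        if not chunk:
            return
        yield chunk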
Example #4
def export_to_csv(
    table,
    filename_or_fobj=None,
    encoding="utf-8",
    dialect=unicodecsv.excel,
    batch_size=100,
    callback=None,
    *args,
    **kwargs
):
    """Export a `rows.Table` to a CSV file.


    If a file-like object is provided it MUST be in binary mode, like in
    `open(filename, mode='wb')`.
    If not filename/fobj is provided, the function returns a string with CSV
    contents.
    """
    # TODO: will work only if table.fields is OrderedDict
    # TODO: should use fobj? What about creating a method like json.dumps?

    if filename_or_fobj is not None:
        _, fobj = get_filename_and_fobj(filename_or_fobj, mode="wb")
    else:
        fobj = BytesIO()

    # TODO: may use `io.BufferedWriter` instead of `ipartition` so the user
    # can choose the real size (in bytes) at which to flush to the file
    # system, instead of the number of rows
    writer = unicodecsv.writer(fobj, encoding=encoding, dialect=dialect)

    if callback is None:
        for batch in ipartition(serialize(table, *args, **kwargs), batch_size):
            writer.writerows(batch)

    else:
        serialized = serialize(table, *args, **kwargs)
        writer.writerow(next(serialized))  # First, write the header
        total = 0
        for batch in ipartition(serialized, batch_size):
            writer.writerows(batch)
            total += len(batch)
            callback(total)

    if filename_or_fobj is not None:
        fobj.flush()
        return fobj
    else:
        fobj.seek(0)
        result = fobj.read()
        fobj.close()
        return result
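Based on the docstring and the two return paths above, a hypothetical usage sketch (names assumed, not from the source) covers both modes:

# Mode 1: write to a binary file object; the function returns the fobj.
with open("output.csv", mode="wb") as fobj:
    export_to_csv(my_table, fobj)      # my_table: assumed rows.Table instance

# Mode 2: pass no filename/fobj; the CSV contents are returned directly
# (read back from an in-memory BytesIO buffer).
contents = export_to_csv(my_table)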
Example #6
def import_file(connection, filename, drop_sql, create_sql, batch_size=100):
    cursor = connection.cursor()

    cursor.execute(drop_sql)
    cursor.execute(create_sql)
    connection.commit()

    header = [line.strip().split()[0]
              for line in create_sql.split('(')[1].split(')')[0].splitlines()
              if line.strip()]
    placeholders = ', '.join('?' for _ in header)
    header_names = ', '.join(header)
    table_name = create_sql.split('(')[0].strip().split()[-1]
    insert_sql = f'INSERT INTO {table_name} ({header_names}) VALUES ({placeholders})'

    with lzma.LZMAFile(filename, mode='rb', format=lzma.FORMAT_XZ) as fobj, \
        io.TextIOWrapper(fobj, encoding='utf-8') as fobj2:

        counter = 0
        for batch in ipartition(csv.DictReader(fobj2), batch_size):
            counter += len(batch)
            cursor.executemany(
                insert_sql,
                [[row[field] for field in header] for row in batch],
            )
            if counter % 100000 == 0:
                print(counter)
        if counter % 100000 != 0:
            print(counter)

    connection.commit()
Example #7
def convert_file(filename, connection, tablename, fields, input_encoding):

    print(f'Converting {filename}...')
    drop_sql = f'DROP TABLE IF EXISTS {tablename}'
    fields_text = ', '.join(f'{field_name} {field_type}'
                            for field_name, field_type in fields.items())
    create_sql = f'CREATE TABLE IF NOT EXISTS {tablename} ({fields_text});'
    header = list(fields.keys())
    placeholders = ', '.join('?' for _ in header)
    header_names = ', '.join(header)
    insert_sql = f'INSERT INTO {tablename} ({header_names}) VALUES ({placeholders})'

    cursor = connection.cursor()
    cursor.execute(drop_sql)
    cursor.execute(create_sql)

    with lzma.open(filename) as fobj:
        fobj = io.TextIOWrapper(fobj, encoding=input_encoding)
        counter = 0
        batch_size = 100000
        for batch in ipartition(csv.DictReader(fobj), batch_size):
            cursor.executemany(
                insert_sql,
                [[row[field] for field in header] for row in batch],
            )

            counter += len(batch)
            if counter % 10000 == 0:
                print(f'  {counter}', end='\r')
        print(f'  {counter} - done.')

    connection.commit()
Example #8
    def handle(self, *args, **options):
        results = tuple(self.politicians_and_results())
        kwargs = {"desc": "Election results", "total": len(results), "unit": "results"}
        with tqdm(**kwargs) as progress_bar:
            for bulk in ipartition(results, 4096):
                bulk = tuple(self.serialize_bulk(bulk))
                bulk_update(bulk, update_fields=("election_history",))
                progress_bar.update(len(bulk))
Example #9
def import_file(filename, Model, encoding='utf-8', batch_size=5000):
    reader = csv.DictReader(get_fobj(filename, encoding))
    counter = 0
    for batch in ipartition(reader, batch_size):
        Model.objects.bulk_create([Model(**data) for data in batch])
        counter += len(batch)
        print(counter)
    connection.commit()
Example #10
    def create_database(self, input_filename, encoding='utf-8'):
        connection = self.connection
        tablename = self.tablename
        fields = {
            'alternative_names': 'TEXT',
            'classification': 'TEXT',
            'first_name': 'TEXT',
            'frequency_female': 'INT',
            'frequency_male': 'INT',
            'frequency_total': 'INT',
            'frequency_group': 'INT',
            'group_name': 'TEXT',
            'ratio': 'FLOAT',
        }
        temptable = f'{tablename}_temp'
        field_types = ', '.join(f'{name} {type_}'
                                for name, type_ in fields.items())
        sql_drop_table = 'DROP TABLE IF EXISTS {tablename}'
        sql_create_temptable = f'CREATE TABLE {temptable} ({field_types})'
        sql_create_index = '''
            CREATE INDEX idx_{tablename}_name_classification
                ON {tablename} (first_name, classification)
        '''
        sql_create_table = f'''
            CREATE TABLE {tablename} AS
                SELECT * FROM {temptable} GROUP BY first_name ORDER BY first_name
        '''

        cursor = connection.cursor()
        cursor.execute(sql_drop_table.format(tablename=temptable))
        cursor.execute(sql_drop_table.format(tablename=tablename))
        cursor.execute(sql_create_temptable)
        connection.commit()

        fobj = io.TextIOWrapper(
            lzma.open(input_filename, mode='r'),
            encoding=encoding,
        )
        progress = tqdm(csv.DictReader(fobj))
        for batch in ipartition(progress, self.batch_size):
            self._insert_names(
                temptable,
                [
                    row['name']
                    for row in batch if row['document_type'] == 'CPF'
                ],
            )
        cursor.execute(sql_create_index.format(tablename=temptable))
        connection.commit()

        cursor.execute(sql_create_table)
        cursor.execute(sql_create_index.format(tablename=tablename))
        cursor.execute(sql_drop_table.format(tablename=temptable))
        connection.commit()

        self._vacuum_db()
Example #11
    def post_handle(self):
        assets = tuple(self.assets_per_politician_per_year())
        kwargs = {
            "desc": f"Calculating {Asset._meta.verbose_name} per year/politician",
            "total": len(assets),
            "unit": "politician",
        }
        with tqdm(**kwargs) as progress_bar:
            for bulk in ipartition(assets, 4096):
                bulk = tuple(self.serialize_bulk(bulk))
                bulk_update(bulk, update_fields=["asset_history"])
                progress_bar.update(len(bulk))
Example #12
    def link_campaign(self, year):
        kwargs = {
            "desc": str(year),
            "total": Candidate.objects.campaign(year).exclude(voter_id=None).count(),
            "unit": "links",
        }
        with tqdm(**kwargs) as progress_bar:
            for bulk in ipartition(self.linked_candidates(year), 4096):
                bulk_update(bulk, update_fields=("politician",))
                progress_bar.update(len(bulk))
Example #13
def export_to_postgresql(
    table,
    connection_or_uri,
    table_name=None,
    table_name_format="table{index}",
    batch_size=100,
    close_connection=False,
    *args,
    **kwargs
):
    # TODO: should add transaction support?

    if table_name is not None and not _valid_table_name(table_name):
        raise ValueError("Invalid table name: {}".format(table_name))

    prepared_table = prepare_to_export(table, *args, **kwargs)
    connection = _get_connection(connection_or_uri)
    cursor = connection.cursor()
    if table_name is None:
        cursor.execute(SQL_TABLE_NAMES)
        table_names = [item[0] for item in cursor.fetchall()]
        table_name = make_unique_name(
            table_name_format.format(index=1),
            existing_names=table_names,
            name_format=table_name_format,
            start=1,
        )
    field_names = next(prepared_table)
    field_types = list(map(table.fields.get, field_names))
    columns = [
        "{} {}".format(field_name, SQL_TYPES.get(field_type, DEFAULT_TYPE))
        for field_name, field_type in zip(field_names, field_types)
    ]
    cursor.execute(
        SQL_CREATE_TABLE.format(table_name=table_name, field_types=", ".join(columns))
    )

    insert_sql = SQL_INSERT.format(
        table_name=table_name,
        field_names=", ".join(field_names),
        placeholders=", ".join("%s" for _ in field_names),
    )
    _convert_row = _python_to_postgresql(field_types)
    for batch in ipartition(prepared_table, batch_size):
        cursor.executemany(insert_sql, map(_convert_row, batch))

    connection.commit()
    cursor.close()
    if close_connection:
        connection.close()
    return connection, table_name
Example #14
def transform_cnae_secundaria(row):
    """Transform row of type CNAE"""

    cnaes = [
        "".join(digits) for digits in ipartition(row.pop("cnae"), 7)
        if set(digits) != set(["0"])
    ]
    data = []
    for cnae in cnaes:
        new_row = row.copy()
        new_row["cnae"] = cnae
        data.append(new_row)

    return data
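Here `ipartition` chunks a string: each chunk is a list of seven characters that is joined back into one CNAE code, and all-zero padding chunks are discarded. An illustrative run with a made-up value (assuming the same `ipartition` exercised in Example #2):

cnae_field = "620150062030470000000"          # made-up, zero-padded value
codes = [
    "".join(digits)
    for digits in ipartition(cnae_field, 7)   # chunks of 7 characters each
    if set(digits) != {"0"}                   # drop all-zero padding chunks
]
# codes == ["6201500", "6203047"]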
Example #15
    def _export_csv(self, query, filename, encoding):
        cursor = self.connection.cursor()
        cursor.execute(query)
        header = [item[0] for item in cursor.description]

        binary_fobj = lzma.open(filename, mode='w')
        fobj = io.TextIOWrapper(binary_fobj, encoding=encoding)
        writer = csv.DictWriter(fobj, fieldnames=header)
        writer.writeheader()
        with tqdm() as progress:
            for batch in ipartition(cursor.fetchall(), self.batch_size):
                writer.writerows([dict(zip(header, row)) for row in batch])
                progress.n += len(batch)
                progress.refresh()
Example #17
    def handle(self, *args, **options):
        self.path = Path(options['csv'])
        if not self.path.exists():
            raise CommandError(f'{self.path} does not exist')

        with open(self.path) as fobj:
            reader = DictReader(fobj, fieldnames=self.headers)
            next(reader)  # skip header row

            data = (self.model(**line) for line in reader)
            for bulk in ipartition(data, self.bulk_size):
                self.model.objects.bulk_create(bulk)
                self.stats(len(bulk))

            print(self.message)
Example #18
    def classify_names(self, workers=16):
        connection = self.connection
        tablename = self.tablename
        query = f'''
            SELECT first_name
            FROM {tablename}
            WHERE classification = '' OR classification IS NULL
        '''
        update_sql = f'''
            UPDATE {tablename}
            SET
                alternative_names = ?,
                classification = ?,
                frequency_female = ?,
                frequency_male = ?,
                frequency_total = ?,
                ratio = ?
            WHERE first_name = ?
        '''

        with Pool(processes=workers) as pool, tqdm() as progress:
            cursor = connection.cursor()
            remaining = self.count_not_classified()
            batch_size = workers * 2

            while remaining:
                cursor.execute(query)
                header = [item[0] for item in cursor.description]
                progress.total = remaining

                for batch in ipartition(cursor.fetchall(), batch_size):
                    names = [
                        dict(zip(header, row))['first_name'] for row in batch
                    ]
                    results = pool.map(download_name_stats, names)
                    update_data = []
                    for name, result in zip(names, results):
                        update_data.append(serialize_row(name, result))
                    cursor.executemany(update_sql, update_data)
                    connection.commit()
                    progress.n += len(batch)
                    progress.update()
                self.extract_alternatives()
                remaining = self.count_not_classified()
Example #19
    def __enter__(self):
        with self.open(self.csv_path, "rt") as input:
            reader = csv.reader(input)
            headers = self.headers or next(reader)

            total = self.total_slices
            desc = f"Slicing {self.csv_path} into smaller files"
            with tqdm(total=total, desc=desc, unit="slices") as progress_bar:
                slices = ipartition(reader, self.bulk_size)
                for count, lines in enumerate(slices):
                    output_path = Path(self.tmp.name) / f"{count}{self.extension}"
                    with self.open(output_path, "wt") as output:
                        writer = csv.writer(output)
                        writer.writerow(headers)
                        writer.writerows(lines)

                    self.slices.append(output_path)
                    progress_bar.update(1)

        return self
Example #20
    def extract_alternatives(self):
        connection = self.connection
        tablename = self.tablename
        query = f'''
            SELECT first_name, alternative_names
            FROM {tablename}
        '''
        cursor = connection.cursor()
        cursor.execute(query)
        header = [item[0] for item in cursor.description]
        data = [dict(zip(header, row)) for row in cursor.fetchall()]
        names, alternatives = set(), set()
        for row in data:
            names.add(row['first_name'])
            if row['alternative_names']:
                alternatives.update(row['alternative_names'].split('|'))
        new_names = correct_names(alternatives - names)
        for batch in ipartition(new_names, self.batch_size):
            self._insert_names(tablename, batch)
        connection.commit()