def _import_helper(repo: Dolt,
                   table_name: str,
                   write_import_file: Callable[[str], None],
                   primary_keys: List[str],
                   import_mode: str) -> None:
    import_modes = IMPORT_MODES_TO_FLAGS.keys()
    if import_mode is not None:
        assert import_mode in import_modes, 'import_mode must be one of: {}'.format(import_modes)
    else:
        if table_name in [table.name for table in repo.ls()]:
            logger.info('No import mode specified, table exists, using "{}"'.format(UPDATE))
            import_mode = UPDATE
        else:
            logger.info('No import mode specified, table does not exist, using "{}"'.format(CREATE))
            import_mode = CREATE

    import_flags = IMPORT_MODES_TO_FLAGS[import_mode]
    logger.info('Importing to table {} in dolt directory located in {}, import mode {}'.format(
        table_name, repo.repo_dir(), import_mode))
    fp = tempfile.NamedTemporaryFile(suffix='.csv')
    write_import_file(fp.name)
    args = ['table', 'import', table_name, '--pk={}'.format(','.join(primary_keys))] + import_flags
    repo.execute(args + [fp.name])
def test_init_new_repo(tmp_path):
    repo_path, repo_data_dir = get_repo_path_tmp_path(tmp_path)
    assert not os.path.exists(repo_data_dir)
    dolt = Dolt(repo_path)
    dolt.init_new_repo()
    assert os.path.exists(repo_data_dir)
    shutil.rmtree(repo_data_dir)
def init_repo(tmp_path) -> Dolt:
    repo_path, repo_data_dir = get_repo_path_tmp_path(tmp_path)
    assert not os.path.exists(repo_data_dir)
    repo = Dolt(repo_path)
    repo.init_new_repo()
    yield repo
    if os.path.exists(repo_data_dir):
        shutil.rmtree(repo_data_dir)
def inner(table_name: str, repo: Dolt) -> DoltTableUpdate:
    current_branch, _ = repo.branch()
    if branch and branch != current_branch.name:
        repo.checkout(branch)
    query_commit = commit_ref or list(repo.log().keys())[0]
    table = get_table_metadata(repo.engine, table_name)
    from_commit, to_commit = get_from_commit_to_commit(repo, query_commit)
    pks_to_drop = get_dropped_pks(repo.engine, table, from_commit, to_commit)
    result = _read_from_dolt_history(repo.engine, table, query_commit)
    return pks_to_drop, result
def inner(repo: Dolt):
    _, current_branches = repo.branch()
    branches = [branch.name for branch in current_branches]
    assert new_branch_name not in branches, 'Branch {} already exists'.format(new_branch_name)
    logger.info('Creating new branch on repo in {} named {} at refspec {}'.format(
        repo.repo_dir(), new_branch_name, refspec))
    repo.branch(new_branch_name)
    return new_branch_name
def write_to_table(repo: Dolt, table: Table, data: List[dict], commit: bool = False, message: str = None):
    """
    Given a repo, table, and data, tries to use the repo's MySQL Server instance to write the provided data to the
    table. Since Dolt does not yet support the ON DUPLICATE KEY clause on INSERT statements, we have to separate
    updates from inserts and run the two sets of statements separately.
    :param repo:
    :param table:
    :param data:
    :param commit:
    :param message:
    :return:
    """
    coerced_data = list(clean_types(data))
    inserts, updates = get_inserts_and_updates(repo.engine, table, coerced_data)
    if inserts:
        logger.info('Inserting {} rows'.format(len(inserts)))
        with repo.engine.connect() as conn:
            conn.execute(table.insert(), inserts)

    # We need to prefix the columns with "_" in order to use bindparam properly
    from copy import deepcopy
    _updates = deepcopy(updates)
    for dic in _updates:
        for col in list(dic.keys()):
            dic['_{}'.format(col)] = dic.pop(col)

    if _updates:
        logger.info('Updating {} rows'.format(len(_updates)))
        with repo.engine.connect() as conn:
            statement = table.update()
            for pk_col in [col.name for col in table.columns if col.primary_key]:
                statement = statement.where(table.c[pk_col] == bindparam('_{}'.format(pk_col)))
            non_pk_cols = [col.name for col in table.columns if not col.primary_key]
            statement = statement.values({col: bindparam('_{}'.format(col)) for col in non_pk_cols})
            conn.execute(statement, _updates)

    if commit:
        repo.add(str(table.name))
        message = message or 'Inserting {} records at {}'.format(len(data), datetime.now())
        repo.commit(message)
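# Hedged usage sketch for write_to_table: assumes a running Dolt SQL server behind repo.engine and an
# existing `players` table; the table name and row values are made up for illustration only.
def _example_write_players(repo: Dolt):
    # Reflect the existing table so we have a SQLAlchemy Table object to hand to write_to_table
    metadata = MetaData(bind=repo.engine)
    metadata.reflect()
    players = metadata.tables['players']
    # Rows are plain dicts keyed by column name; inserts and updates are separated internally
    rows = [{'id': 1, 'name': 'Roger'}, {'id': 2, 'name': 'Rafael'}]
    write_to_table(repo, players, rows, commit=True, message='Loading example players')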
def test_config_local(init_empty_test_repo):
    repo = init_empty_test_repo
    current_global_config = Dolt.config_global(list=True)
    test_username, test_email = 'test_user', 'test_email'
    repo.config_local(add=True, name='user.name', value=test_username)
    repo.config_local(add=True, name='user.email', value=test_email)
    local_config = repo.config_local(list=True)
    global_config = Dolt.config_global(list=True)
    assert local_config['user.name'] == test_username and local_config['user.email'] == test_email
    assert global_config['user.name'] == current_global_config['user.name']
    assert global_config['user.email'] == current_global_config['user.email']
def read_table(repo: Dolt, table_name: str, delimiter: str = ',') -> pd.DataFrame:
    """
    Reads the contents of a table and returns it as a Pandas `DataFrame`. Under the hood this uses table export and
    the filesystem; in short order we are likely to replace this with use of the MySQL Server.
    :param repo:
    :param table_name:
    :param delimiter:
    :return:
    """
    fp = tempfile.NamedTemporaryFile(suffix='.csv')
    repo.execute(['table', 'export', table_name, fp.name, '-f'])
    result = pd.read_csv(fp.name, delimiter=delimiter)
    return result
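# Hedged usage sketch for read_table: the table name and column are illustrative assumptions; the
# result is an ordinary Pandas DataFrame, so standard DataFrame operations apply.
def _example_read_players(repo: Dolt) -> pd.DataFrame:
    df = read_table(repo, 'players')
    # Filter and inspect like any other DataFrame
    return df[df['name'].notnull()]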
def inner(table_name: str, repo: Dolt) -> DoltTableUpdate:
    current_branch, _ = repo.branch()
    if branch and branch != current_branch.name:
        repo.checkout(branch)
    from_commit, to_commit = get_from_commit_to_commit(repo, commit_ref)
    metadata = MetaData(bind=repo.engine)
    metadata.reflect()
    table = metadata.tables[table_name]
    pks_to_drop = get_dropped_pks(repo.engine, table, from_commit, to_commit)
    result = _read_from_dolt_diff(repo.engine, table, from_commit, to_commit)
    return pks_to_drop, result
def init_empty_test_repo(tmp_path) -> Dolt:
    repo_path, repo_data_dir = get_repo_path_tmp_path(tmp_path)
    assert not os.path.exists(repo_data_dir)
    repo = Dolt.init(repo_path, ServerConfig(loglevel='trace', timeout=1000000))
    yield repo
    if os.path.exists(repo_data_dir):
        shutil.rmtree(repo_data_dir)
def init_other_empty_test_repo(tmp_path) -> Dolt:
    repo_path, repo_data_dir = get_repo_path_tmp_path(tmp_path, 'other')
    assert not os.path.exists(repo_data_dir)
    os.mkdir(repo_path)
    repo = Dolt.init(repo_path, ServerConfig(port=3307))
    yield repo
    if os.path.exists(repo_data_dir):
        shutil.rmtree(repo_data_dir)
def inner(repo: Dolt):
    _import_mode = import_mode or ('create' if table not in [t.name for t in repo.ls()] else 'update')
    data_to_load = _apply_file_transformers(get_data(), transformers)
    bulk_import(repo, table, data_to_load, pk_cols, import_mode=_import_mode)
    return table
def inner(repo: Dolt):
    _transformers = transformers + [insert_unique_key] if transformers else [insert_unique_key]
    data = _apply_df_transformers(get_data(), _transformers)
    if table not in [t.name for t in repo.ls()]:
        raise ValueError('Missing table')

    # Get existing PKs
    existing = read_table(repo, table)
    existing_pks = existing[INSERTED_ROW_HASH_COL].to_list()

    # Get proposed PKs
    proposed_pks = data[INSERTED_ROW_HASH_COL].to_list()

    to_drop = [existing for existing in existing_pks if existing not in proposed_pks]
    if to_drop:
        iterator = iter(to_drop)
        while True:
            batch = list(itertools.islice(iterator, 30000))
            if len(batch) == 0:
                break
            logger.info('Dropping batch of {} IDs from table {}'.format(len(batch), table))
            drop_statement = '''
                DELETE FROM {table} WHERE {pk} in ("{pks_to_drop}")
            '''.format(table=table, pk=INSERTED_ROW_HASH_COL, pks_to_drop='","'.join(batch))
            repo.sql(query=drop_statement)

    new_data = data[~(data[INSERTED_ROW_HASH_COL].isin(existing_pks))]
    if not new_data.empty:
        logger.info('Importing {} records'.format(len(new_data)))
        import_df(repo, table, new_data, [INSERTED_ROW_HASH_COL], 'update')

    return table
def sync_schema_to_dolt(source_engine: Engine, repo: Dolt, table_map: Mapping[str, str], type_mapping: dict):
    """
    Syncs the schemas of the tables named in the keys of table_map from the source database to the tables named by
    the corresponding values in the Dolt repo, coercing column types via type_mapping. Existing target tables are
    dropped and recreated.
    :param source_engine: SQLAlchemy Engine for the source database
    :param repo: target Dolt repo
    :param table_map: mapping from source table names to target table names
    :param type_mapping: mapping used to coerce source column types to Dolt-compatible types
    :return:
    """
    source_metadata = MetaData(bind=source_engine)
    source_metadata.reflect()
    target_metadata = MetaData(bind=repo.get_engine())
    target_metadata.reflect()
    for source_table_name, target_table_name in table_map.items():
        source_table = source_metadata.tables[source_table_name]
        target_table = coerce_schema_to_dolt(target_table_name, source_table, type_mapping)
        if target_table_name in target_metadata.tables.keys():
            target_table.drop(repo.get_engine())
        target_table.create(repo.get_engine())
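# Hedged usage sketch for sync_schema_to_dolt: the table names are made up, and the exact shape of
# type_mapping depends on coerce_schema_to_dolt; here we assume it maps source SQLAlchemy type
# classes to Dolt-compatible SQLAlchemy types.
def _example_sync_schema(source_engine: Engine, repo: Dolt):
    from sqlalchemy import types
    table_map = {'players': 'players', 'matches': 'matches'}
    type_mapping = {types.BIGINT: types.Integer}
    sync_schema_to_dolt(source_engine, repo, table_map, type_mapping)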
def _import_helper(repo: Dolt,
                   table_name: str,
                   write_import_file: Callable[[str], None],
                   primary_keys: List[str],
                   import_mode: str) -> None:
    import_modes = IMPORT_MODES_TO_FLAGS.keys()
    if import_mode is not None:
        assert import_mode in import_modes, 'import_mode must be one of: {}'.format(import_modes)
    else:
        if table_name in [table.name for table in repo.ls()]:
            logger.info('No import mode specified, table exists, using "{}"'.format(UPDATE))
            import_mode = UPDATE
        else:
            logger.info('No import mode specified, table does not exist, using "{}"'.format(CREATE))
            import_mode = CREATE

    if import_mode == CREATE and primary_keys is None:
        raise ValueError('Import mode CREATE requires a primary key to be specified')

    import_flags = IMPORT_MODES_TO_FLAGS[import_mode]
    logger.info('Importing to table {} in dolt directory located in {}, import mode {}'.format(
        table_name, repo.repo_dir(), import_mode))
    fname = tempfile.mktemp(suffix='.csv')
    try:
        write_import_file(fname)
        args = ['table', 'import', table_name] + import_flags
        if import_mode == CREATE:
            args += ['--pk={}'.format(','.join(primary_keys))]
        repo.execute(args + [fname])
    finally:
        if os.path.exists(fname):
            os.remove(fname)
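# Hedged sketch of the write_import_file contract used by _import_helper: the helper supplies a
# temporary file path and the callback is responsible for writing CSV data to it. The table name,
# columns, and primary key below are illustrative assumptions.
def _example_import_via_helper(repo: Dolt):
    df = pd.DataFrame({'id': [1, 2], 'name': ['Roger', 'Rafael']})

    def write_import_file(path: str):
        df.to_csv(path, index=False)

    _import_helper(repo, 'players', write_import_file, primary_keys=['id'], import_mode=CREATE)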
def _create_table_inferred(repo: Dolt, table_name: str, data: Mapping[str, List[Any]], primary_keys: List[str]):
    # generate and execute a create table statement
    cols_to_types = {}
    for col_name, list_of_values in data.items():
        # Just take the first non-null value to infer the type
        first_non_null = None
        for val in list_of_values:
            if val is not None:
                first_non_null = val
                break
        if first_non_null is None:
            raise ValueError('Cannot provide an empty list, types cannot be inferred')
        cols_to_types[col_name] = _get_col_type(first_non_null, list_of_values)

    metadata = MetaData(bind=repo.get_engine())
    table = _get_table_def(metadata, table_name, cols_to_types, primary_keys)
    table.create()
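# Hedged sketch of the column-major input _create_table_inferred expects: one equal-length list per
# column, with types inferred from the first non-null value in each list. Names are illustrative.
def _example_create_table_inferred(repo: Dolt):
    data = {'id': [1, 2], 'name': ['Roger', 'Rafael']}
    _create_table_inferred(repo, 'players', data, primary_keys=['id'])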
def _test_dolt_table_reader_helper(repo: Dolt,
                                   table: Table,
                                   build_table_reader: Callable[[str], Callable[[str, Dolt], DoltTableUpdate]],
                                   get_expected: Callable[[int], Tuple[List[dict], List[dict]]]):
    commits = list(repo.log().keys())
    update_to_commit = {
        FIRST_UPDATE: commits[4],
        SECOND_UPDATE: commits[3],
        THIRD_UPDATE: commits[2],
        FOURTH_UPDATE: commits[1],
        FIFTH_UPDATE: commits[0]
    }

    for update_num, commit in update_to_commit.items():
        logger.info('comparison for commit/update_num {}/{}'.format(commit, update_num))
        dropped_pks, dolt_data = build_table_reader(commit)(str(table.name), repo)
        expected_dropped_pks, expected_data = get_expected(update_num)
        assert expected_dropped_pks == dropped_pks
        assert_rows_equal(expected_data, list(dolt_data))
def get_from_commit_to_commit(repo: Dolt, commit_ref: str = None) -> Tuple[str, str]:
    """
    Given a repo and a commit, returns that commit and its parent. If no commit is provided, HEAD and the parent of
    HEAD are returned.
    :param repo:
    :param commit_ref:
    :return:
    """
    commits = list(repo.log().keys())
    commit_ref_index = None
    if not commit_ref:
        commit_ref_index = 0
    else:
        for i, commit in enumerate(commits):
            if commit == commit_ref:
                commit_ref_index = i
                break
    assert commit_ref_index is not None, 'commit_ref not found in commit index'
    return commits[commit_ref_index + 1], commits[commit_ref_index]
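# Hedged sketch of how the commit pair is used: with no commit_ref the pair is (parent of HEAD, HEAD),
# which is what the diff readers above consume as (from_commit, to_commit).
def _example_commit_pair(repo: Dolt) -> Tuple[str, str]:
    from_commit, to_commit = get_from_commit_to_commit(repo)
    # The "to" side of the pair is the most recent commit in the log
    assert to_commit == list(repo.log().keys())[0]
    return from_commit, to_commit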
def _create_table_from_schema_import_helper(repo: Dolt,
                                            table: str,
                                            pks: List[str],
                                            path: str,
                                            transformers: List[DataframeTransformer] = None,
                                            commit: bool = True,
                                            commit_message: str = None):
    if transformers:
        fp = tempfile.NamedTemporaryFile(suffix='.csv')
        temp = pd.read_csv(path)
        transformed = _apply_df_transformers(temp, transformers)
        transformed.to_csv(fp.name, index=False)
        path = fp.name

    repo.schema_import(table=table, pks=pks, filename=path, create=True)

    if commit:
        message = commit_message or 'Creating table {}'.format(table)
        repo.add(table)
        repo.commit(message)
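# Hedged usage sketch for _create_table_from_schema_import_helper: transformers are assumed to be
# callables from DataFrame to DataFrame applied before schema import; the path and column names are
# made up for illustration.
def _example_schema_import(repo: Dolt):
    def lowercase_names(df: pd.DataFrame) -> pd.DataFrame:
        df['name'] = df['name'].str.lower()
        return df

    _create_table_from_schema_import_helper(
        repo, table='players', pks=['id'], path='players.csv',
        transformers=[lowercase_names], commit=True)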
def test_bad_repo_path(tmp_path):
    bad_repo_path = tmp_path
    with pytest.raises(AssertionError):
        Dolt(bad_repo_path)
def test_config_global(init_empty_test_repo):
    _ = init_empty_test_repo
    current_global_config = Dolt.config_global(list=True)
    test_username, test_email = 'test_user', 'test_email'
    Dolt.config_global(add=True, name='user.name', value=test_username)
    Dolt.config_global(add=True, name='user.email', value=test_email)
    updated_config = Dolt.config_global(list=True)
    assert updated_config['user.name'] == test_username and updated_config['user.email'] == test_email
    Dolt.config_global(add=True, name='user.name', value=current_global_config['user.name'])
    Dolt.config_global(add=True, name='user.email', value=current_global_config['user.email'])
    reset_config = Dolt.config_global(list=True)
    assert reset_config['user.name'] == current_global_config['user.name']
    assert reset_config['user.email'] == current_global_config['user.email']
def _verify_branches(repo: Dolt, branch_list: List[str]):
    _, branches = repo.branch()
    assert set(branch.name for branch in branches) == set(branch_list)
"Joseph R. Biden and\nKamala D. Harris (D)": "JOSEPH BIDEN", "Jo Jorgensen and\nJeremy \"Spike\" Cohen (L)": "JO JORGENSEN", "Donald J. Trump and\nMichael R. Pence (R)": "DONALD TRUMP", "Brock Pierce and\nKarla Ballard (I)": "BROCK PIERCE", "Write-Ins": "WRITE-IN" } party_chart: dict = { "JOSEPH BIDEN": "DEMOCRATIC", "JO JORGENSEN": "LIBERTARIAN", "DONALD TRUMP": "REPUBLICAN", "BROCK PIERCE": "NEW YORK INDEPENDENCE PARTY", "WRITE-IN": "None" } repo = Dolt("working/us-president-precinct-results") sql_file = open( "working/us-president-precinct-results/sql-import-me-wyoming.sql", mode="w") wyoming_precincts: List[str] = [] with open("./working/us-president-precinct-results/wyoming-precincts.csv", 'r') as f: for line in f: # print(line, end='') wyoming_precincts.append(line) for file in os.listdir("working/Wyoming_General_CSV/"): if "County" not in file: continue
def get_raw_data(repo: Dolt):
    return pd.concat([
        repo.read_table(MENS_MAJOR_COUNT).assign(gender='mens'),
        repo.read_table(WOMENS_MAJOR_COUNT).assign(gender='womens')
    ])
def import_dict(repo: Dolt,
                table_name: str,
                data: Mapping[str, List[Any]],
                primary_keys: List[str] = None,
                import_mode: str = None,
                batch_size: int = DEFAULT_BATCH_SIZE):
    """
    Provides a column-major interface for writing Python data structures to Dolt. Specifically, data should be a dict
    where the keys are column names and the values are equal-length lists of values to be written to Dolt. The lists
    must consist of:
        - values that match the type of the column in the schema of the table being written to
        - values of the same type that can be coalesced to a Python type by the (very limited) type inference logic
          for generating a schema from a data structure

    Note it is necessary for all lists to be of the same length since we must coalesce the lists into rows, and that
    doesn't really make sense when the lists are not of the same length.

    Let's proceed with the example of creating a simple table and showing how to write some data structures:
        CREATE TABLE players (id INT, name VARCHAR(16), PRIMARY KEY (id))

    Now write in update mode:
    >>> dict_of_lists = {'id': [1, 2], 'name': ['Roger', 'Rafael']}
    >>> import_dict(repo, 'players', dict_of_lists, import_mode='update')

    Alternatively we can let the Python code infer a schema:
    >>> import_dict(repo, 'players', dict_of_lists, ['id'], import_mode='create')

    Assertions:
        - all value lists are of equal length
        - when inferring a schema, each list has elements of a type that can be mapped to a SQL type; the inference
          logic is currently very limited

    This function requires the Dolt SQL server to be running on the host and port provided, defaulting to
    127.0.0.1:3306.

    :param repo:
    :param table_name:
    :param data:
    :param primary_keys:
    :param import_mode:
    :param batch_size:
    :return:
    """
    assert import_mode in [UPDATE, CREATE]

    # Grab some basic information about the data
    assert data, 'Cannot provide an empty dictionary'
    row_count = len(list(data.values())[0])
    assert row_count > 0, 'Must provide at least a single row'
    assert all(len(val_list) == row_count for val_list in data.values()), 'Must provide value lists of uniform length'

    # If the table does not exist, create it using type inference to build a create statement
    if import_mode == CREATE:
        assert primary_keys, 'primary_keys need to be provided when inferring a schema'
        _create_table_inferred(repo, table_name, data, primary_keys)

    rows = []
    for i in range(row_count):
        rows.append({col: data[col][i] for col in data.keys()})
    clean_rows = coerce_dates(rows)

    logger.info('Inserting {row_count} rows into table {table_name}'.format(
        row_count=row_count, table_name=table_name))

    metadata = MetaData(bind=repo.get_engine())
    metadata.reflect()
    table = metadata.tables[table_name]

    for i in range(max(1, math.ceil(len(clean_rows) / batch_size))):
        batch_start = i * batch_size
        batch_end = min((i + 1) * batch_size, len(clean_rows))
        batch = clean_rows[batch_start:batch_end]
        logger.info('Writing records {} through {} of {} rows to Dolt'.format(batch_start, batch_end, len(clean_rows)))
        with repo.get_engine().connect() as conn:
            conn.execute(table.insert(), batch)
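# Hedged usage sketch mirroring the docstring above: first create the table with an inferred schema,
# then append more rows in update mode. Table, columns, and values are illustrative.
def _example_import_dict(repo: Dolt):
    dict_of_lists = {'id': [1, 2], 'name': ['Roger', 'Rafael']}
    import_dict(repo, 'players', dict_of_lists, primary_keys=['id'], import_mode=CREATE)
    more_rows = {'id': [3], 'name': ['Novak']}
    import_dict(repo, 'players', more_rows, import_mode=UPDATE)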
def inner(repo: Dolt):
    current_branch, current_branch_list = repo.branch()
    original_branch = current_branch.name

    if branch != original_branch and not commit:
        raise ValueError('If writes are to another branch, and commit is not True, writes will be lost')

    if current_branch.name != branch:
        logger.info('Current branch is {}, checking out {}'.format(current_branch.name, branch))
        if branch not in [b.name for b in current_branch_list]:
            logger.info('{} does not exist, creating'.format(branch))
            repo.branch(branch_name=branch)
        repo.checkout(branch)

    if transaction_mode:
        raise NotImplementedError('transaction_mode is not yet implemented')

    tables_updated = [writer(repo) for writer in writers]

    if commit:
        if not repo.status().is_clean:
            logger.info('Committing to repo located in {} for tables:\n{}'.format(repo.repo_dir(), tables_updated))
            for table in tables_updated:
                repo.add(table)
            repo.commit(message)
        else:
            logger.warning('No changes to repo in:\n{}'.format(repo.repo_dir()))

    current_branch, branches = repo.branch()
    if original_branch != current_branch.name:
        logger.info('Checked out {} from {}, checking out {} to restore state'.format(
            current_branch.name, original_branch, original_branch))
        repo.checkout(original_branch)

    return branch
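# Hedged sketch of the writer contract consumed above: each writer is a callable that takes the repo,
# performs its writes, and returns the name of the table it touched so the table can be added and
# committed. The table and SQL statement below are illustrative assumptions.
def _example_writer(repo: Dolt) -> str:
    repo.sql(query="INSERT INTO players (id, name) VALUES (5, 'Andy')")
    return 'players'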
def _dolt_table_read_helper(repo: Dolt, table_name: str):
    table = get_table_metadata(repo.get_engine(), table_name)
    with repo.get_engine().connect() as conn:
        result = conn.execute(table.select())
        return [dict(row) for row in result]