def write_metadata(self, data: List[DoltMeta]):
    """It is important that the metadata commit is recorded immediately after the data commit."""
    meta_df = pd.DataFrame.from_records(
        [x.dict() for x in self.table_reads + self.table_writes])
    import_df(repo=self.meta_doltdb,
              table_name="metadata",
              data=meta_df,
              primary_keys=meta_df.columns.tolist())
def inner(repo: Dolt):
    input_data = get_data(repo)
    transformed_data = transformer(input_data)
    import_df(repo, target_table, transformed_data, target_pk_cols,
              import_mode=import_mode)
    return target_table
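# Hedged sketch of the factory that presumably wraps inner() above; the name
# get_table_transformer and the exact parameter order are assumptions made for
# illustration, not taken from the original source.
def get_table_transformer(get_data, target_table, target_pk_cols, transformer,
                          import_mode='update'):
    def inner(repo: Dolt):
        input_data = get_data(repo)
        transformed_data = transformer(input_data)
        import_df(repo, target_table, transformed_data, target_pk_cols,
                  import_mode=import_mode)
        return target_table
    return inner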
def write_table(self, table_name: str, df: pd.DataFrame, pks: List[str]):
    """
    Writes the contents of the given DataFrame to the specified table. If the table exists it is
    updated; if it does not, it is created.
    """
    assert current.is_running_flow, 'Writes and commits are only supported in a running Flow'
    import_df(repo=self.doltdb, table_name=table_name, data=df, primary_keys=pks)
    self.table_writes.append(self._get_table_write(table_name))
def _import_and_commit(dolt: Dolt, table: str, data: pd.DataFrame,
                       primary_keys: Optional[List[str]], import_mode: str):
    dolt_write.import_df(dolt, table, pd.DataFrame(data), primary_keys, import_mode)
    dolt.add(table)
    dolt.commit('Executed import on table {} in import mode "{}"'.format(table, import_mode))
    commit = dolt.log()[0]
    return {
        'commit_hash': commit.hash,
        'timestamp': commit.ts,
        'author': commit.author,
        'message': commit.message
    }
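# Hedged usage sketch for _import_and_commit (not part of the original source).
# It assumes doltpy's Dolt.init() is available and uses './example_db' as a
# hypothetical, writable path chosen purely for illustration.
def _example_import_and_commit():
    import os
    import pandas as pd

    os.makedirs('./example_db', exist_ok=True)  # hypothetical location
    repo = Dolt.init('./example_db')
    players = pd.DataFrame({'id': [1, 2], 'name': ['Alice', 'Bob']})
    commit_info = _import_and_commit(repo, 'players', players, ['id'], 'create')
    # commit_info is the dict built above: commit hash, timestamp, author, message
    return commit_info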
def create_test_table(init_empty_test_repo, create_test_data) -> Tuple[Dolt, str]:
    repo, test_data_path = init_empty_test_repo, create_test_data
    repo.sql(query='''
        CREATE TABLE `test_players` (
            `name` LONGTEXT NOT NULL COMMENT 'tag:0',
            `id` BIGINT NOT NULL COMMENT 'tag:1',
            PRIMARY KEY (`id`)
        );
    ''')
    import_df(repo, 'test_players', pd.read_csv(test_data_path), ['id'], UPDATE)
    yield repo, 'test_players'
    if 'test_players' in [table.name for table in repo.ls()]:
        _execute(['table', 'rm', 'test_players'], repo.repo_dir())
def inner(repo: Dolt):
    _transformers = (transformers + [insert_unique_key]) if transformers else [insert_unique_key]
    data = _apply_df_transformers(get_data(), _transformers)
    if table not in [t.name for t in repo.ls()]:
        raise ValueError('Missing table')

    # Get existing PKs
    existing = read_table(repo, table)
    existing_pks = existing[INSERTED_ROW_HASH_COL].to_list()

    # Get proposed PKs
    proposed_pks = data[INSERTED_ROW_HASH_COL].to_list()

    to_drop = [pk for pk in existing_pks if pk not in proposed_pks]
    if to_drop:
        # Delete stale rows in fixed-size batches so each generated DELETE stays a manageable size
        iterator = iter(to_drop)
        while True:
            batch = list(itertools.islice(iterator, 30000))
            if len(batch) == 0:
                break
            logger.info('Dropping batch of {} IDs from table {}'.format(len(batch), table))
            drop_statement = '''
                DELETE FROM {table} WHERE {pk} in ("{pks_to_drop}")
            '''.format(table=table, pk=INSERTED_ROW_HASH_COL, pks_to_drop='","'.join(batch))
            repo.sql(query=drop_statement)

    new_data = data[~(data[INSERTED_ROW_HASH_COL].isin(existing_pks))]
    if not new_data.empty:
        logger.info('Importing {} records'.format(len(new_data)))
        import_df(repo, table, new_data, [INSERTED_ROW_HASH_COL], 'update')

    return table
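# Hedged, standalone sketch of the batching idiom used in the delete loop above
# (names here are illustrative only, not from the original source):
# itertools.islice repeatedly pulls fixed-size chunks off a single iterator
# until it is exhausted.
import itertools

def batched(values, batch_size=30000):
    iterator = iter(values)
    while True:
        batch = list(itertools.islice(iterator, batch_size))
        if not batch:
            break
        yield batch

# e.g. for batch in batched(existing_pks): build and run one DELETE per batch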
def _insert_row_helper(repo, table, row):
    import_df(repo, table, row, ['id'], import_mode=UPDATE)
def test_get_dirty_tables(create_test_table):
    repo, test_table = create_test_table
    message = 'Committing test data'

    # Some test data
    initial = pd.DataFrame({
        'id': [1],
        'name': ['Bianca'],
        'role': ['Champion']
    })
    appended_row = pd.DataFrame({
        'name': ['Serena'],
        'id': [2],
        'role': ['Runner-up']
    })

    def _insert_row_helper(repo, table, row):
        import_df(repo, table, row, ['id'], import_mode=UPDATE)

    # existing, not modified
    repo.add(test_table)
    repo.commit(message)

    # existing, modified, staged
    modified_staged = 'modified_staged'
    import_df(repo, modified_staged, initial, ['id'])
    repo.add(modified_staged)

    # existing, modified, unstaged
    modified_unstaged = 'modified_unstaged'
    import_df(repo, modified_unstaged, initial, ['id'])
    repo.add(modified_unstaged)

    # Commit and modify data
    repo.commit(message)
    _insert_row_helper(repo, modified_staged, appended_row)
    import_df(repo, modified_staged, appended_row, ['id'], UPDATE)
    repo.add(modified_staged)
    import_df(repo, modified_unstaged, appended_row, ['id'], UPDATE)

    # created, staged
    created_staged = 'created_staged'
    import_df(repo, created_staged, initial, ['id'])
    repo.add(created_staged)

    # created, unstaged
    created_unstaged = 'created_unstaged'
    import_df(repo, created_unstaged, initial, ['id'])

    status = repo.status()

    expected_new_tables = {'created_staged': True, 'created_unstaged': False}
    expected_changes = {'modified_staged': True, 'modified_unstaged': False}

    assert status.added_tables == expected_new_tables
    assert status.modified_tables == expected_changes
def load_to_dolt(df, table):
    repo = Dolt(DOLT_REPO)
    import_df(repo, table, df,
              ['year', 'month'] + crosswalk_table_to_pk[table],
              import_mode='update')
    exit_code = process.wait()
    return output


repo_name = 'Liquidata/online-services'
root = '.'
repo = Dolt.clone(repo_name, root)

documents_df = read_table(repo, 'documents')
documents_df['terms_raw'] = documents_df['terms_raw'].astype(str)
documents_df['privacy_raw'] = documents_df['privacy_raw'].astype(str)

for index, row in documents_df.iterrows():
    print(f'Processing {index}')
    documents_df.at[index, 'terms_raw'] = scrape_document(row['terms_url'])
    documents_df.at[index, 'privacy_raw'] = scrape_document(row['privacy_url'])

import_df(repo, 'documents', documents_df, ['product_id'])

if repo.status().is_clean:
    print('No changes to repo. Exiting')
else:
    print('Committing and pushing to DoltHub')
    repo.add('documents')
    now = datetime.datetime.now()
    print(f'Latest documents downloaded {now}')
    repo.commit(f'Latest data downloaded {now}')
    repo.push('origin', 'master')
def inner(repo: Dolt):
    _import_mode = import_mode or (
        'create' if table not in [t.name for t in repo.ls()] else 'update')
    data_to_load = _apply_df_transformers(get_data(), transformers)
    import_df(repo, table, data_to_load, pk_cols, import_mode=_import_mode)
    return table