def update_datahub_contact_consent(
    target_db: str,
    table: sqlalchemy.Table,
    **kwargs,
):
    """
    Updates Contacts temp table with email marketing consent data from the Consent dataset.
    """
    table = get_temp_table(table, kwargs['ts_nodash'])
    update_consent_query = f"""
        UPDATE {table.schema}.{table.name} AS contacts_temp
        SET email_marketing_consent = consent.email_marketing_consent
        FROM {ConsentPipeline.fq_table_name()} AS consent
        WHERE lower(contacts_temp.email) = lower(consent.email)
    """
    engine = sqlalchemy.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
        echo=config.DEBUG,
    )
    with engine.begin() as conn:
        conn.execute(sqlalchemy.text(update_consent_query))
    logger.info('Updated Contacts temp table with email consent from Consent dataset')
def _get_csvfile_for_each_period(data_for_each_period):
    for zip_bytes in data_for_each_period:
        with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
            name = archive.namelist()[0]
            logger.info('Opening csv file %s in zip', name)
            with archive.open(name, "r") as file:
                yield file
def fetch_apple_mobility_data(
    table_name: str,
    base_url: str,
    config_path: str,
    df_transform: Callable[[pd.DataFrame], pd.DataFrame],
    page_size: int = 1000,
    **kwargs,
):
    s3 = S3Data(table_name, kwargs['ts_nodash'])
    api_config = requests.get(base_url + config_path).json()
    source_url = (
        base_url + api_config['basePath'] + api_config['regions']['en-us']['csvPath']
    )
    logger.info(f'Fetching csv from {source_url}')
    response = requests.get(source_url)
    df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
    df = df_transform(df)
    page = 1
    for i in range((len(df) // page_size) + 1):
        results = df.iloc[page_size * i : page_size * (i + 1)].to_json(
            orient="records", date_format="iso"
        )
        s3.write_key(f"{page:010}.json", results, jsonify=False)
        page += 1
    logger.info('Fetching from source completed')
def fetch_from_hosted_csv(
    table_name: str,
    source_url: str,
    page_size: int = 1000,
    allow_empty_strings: bool = True,
    **kwargs,
):
    s3 = S3Data(table_name, kwargs["ts_nodash"])
    results = []
    page = 1
    with closing(requests.get(source_url, stream=True)) as request:
        reader = csv.DictReader(codecs.iterdecode(request.iter_lines(), 'utf-8'))
        for row in reader:
            if not allow_empty_strings:
                row = {k: v if v != '' else None for k, v in row.items()}  # type: ignore
            results.append(row)
            if len(results) >= page_size:
                s3.write_key(f"{page:010}.json", results)
                results = []
                page += 1
    if results:
        s3.write_key(f"{page:010}.json", results)
    logger.info("Fetching from source completed")
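# How one of these fetch callables is typically driven: Airflow supplies
# `ts_nodash` (and the rest of **kwargs) from the task context. The wiring
# below is a minimal sketch only - the DAG id, URL, and table name are made
# up, and the real pipelines in this codebase generate their DAGs elsewhere.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

with DAG(
    'example-hosted-csv',
    start_date=datetime(2020, 1, 1),
    schedule_interval='@daily',
) as dag:
    fetch = PythonOperator(
        task_id='run-fetch',
        python_callable=fetch_from_hosted_csv,
        provide_context=True,  # Airflow 1.x: passes ts_nodash etc. into **kwargs
        op_kwargs={
            'table_name': 'example_table',  # hypothetical
            'source_url': 'https://example.com/data.csv',  # hypothetical
        },
    )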
def _check_table(engine, conn, temp: sa.Table, target: sa.Table, allow_null_columns: bool):
    logger.info("Checking %s", temp.name)

    if engine.dialect.has_table(conn, target.name, schema=target.schema):
        logger.info("Checking record counts")
        temp_count = conn.execute(
            sa.select([sa.func.count()]).select_from(temp)
        ).fetchone()[0]
        target_count = conn.execute(
            sa.select([sa.func.count()]).select_from(target)
        ).fetchone()[0]

        logger.info("Current record count %s, new import count %s", target_count, temp_count)

        if target_count > 0 and temp_count / target_count < 0.9:
            raise MissingDataError("New record count is less than 90% of current data")

    logger.info("Checking for empty columns")
    for col in temp.columns:
        row = conn.execute(
            sa.select([temp]).select_from(temp).where(col.isnot(None)).limit(1)
        ).fetchone()
        if row is None:
            error = f"Column {col} only contains NULL values"
            if allow_null_columns or config.ALLOW_NULL_DATASET_COLUMNS:
                logger.warning(error)
            else:
                raise UnusedColumnError(error)
    logger.info("All columns are used")
def send_dataset_update_emails(update_emails_data_environment_variable):
    if update_emails_data_environment_variable not in os.environ:
        raise ValueError(
            f"Could not find data in environment for `{update_emails_data_environment_variable}`"
        )

    dataset_info = json.loads(os.environ[update_emails_data_environment_variable])
    dataset_url = dataset_info['dataset_url']
    dataset_name = dataset_info['dataset_name']
    emails = dataset_info['emails']

    client = NotificationsAPIClient(os.environ['NOTIFY_API_KEY'])

    logger.info(
        f"Sending `dataset updated` emails to subscribers for "
        f"this pipeline (`{update_emails_data_environment_variable}`)."
    )
    for email in emails:
        client.send_email_notification(
            email_address=email,
            template_id=os.environ['NOTIFY_TEMPLATE_ID__DATASET_UPDATED'],
            personalisation={"dataset_name": dataset_name, "dataset_url": dataset_url},
        )
def _download(source_url, params=()):
    # Streaming variant: returns the Response object and redacts the `token`
    # query parameter from the log line.
    logger.info(
        'Downloading %s %s',
        source_url,
        [(key, value) for (key, value) in params if key != 'token'],
    )
    response = requests.get(source_url, stream=True, params=params)
    response.raise_for_status()
    return response
def _download(source_url):
    # Non-streaming variant: returns the response body as bytes.
    logger.info('Downloading %s', source_url)
    response = requests.get(source_url)
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        logger.error("Request failed: %s", response.text)
        raise
    return response.content
def _hawk_api_request(
    url: str,
    credentials: dict,
    results_key: Optional[str],
    next_key: Optional[str],
    validate_response: Optional[bool] = True,
    force_http: Optional[bool] = False,
):
    sender = Sender(
        credentials,
        # Currently data workspace denies hawk requests signed with https urls.
        # Once fixed the protocol replacement can be removed.
        url.replace('https', 'http') if force_http else url,
        "get",
        content="",
        content_type="",
        always_hash_content=True,
    )

    logger.info(f"Fetching page {url}")
    response = requests.get(
        url,
        headers={"Authorization": sender.request_header, "Content-Type": ""},
        timeout=300,
    )

    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        logger.warning(f"Request failed: {response.text}")
        raise

    if validate_response:
        try:
            sender.accept_response(
                response.headers["Server-Authorization"],
                content=response.content,
                content_type=response.headers["Content-Type"],
            )
        except HawkFail as e:
            logger.error(f"HAWK Authentication failed {str(e)}")
            raise

    response_json = response.json()

    if (next_key and next_key not in response_json) or (
        results_key and results_key not in response_json
    ):
        raise ValueError("Unexpected response structure")

    return response_json
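# `Sender` above is mohawk's Hawk client. The credentials dict it expects has
# the shape below; the values are placeholders, not real configuration.
hawk_credentials = {
    'id': 'some-client-id',  # placeholder
    'key': 'some-secret-key',  # placeholder
    'algorithm': 'sha256',
}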
def create_views(
    target_db: str,
    schema_name: str,
    table_name: str,
    **kwargs,
):
    """
    Create views for available publication dates
    """
    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
    )
    with engine.begin() as conn:
        fq_table_name = f'"{schema_name}"."{table_name}"'
        result_set = conn.execute(
            f"""
            select distinct publication_date from {fq_table_name}
            """
        )
        rows = result_set.fetchall()
        if rows:
            dates = [row[0] for row in rows]
            for date in dates:
                postfix = date.strftime("%b%Y").lower()
                fq_view_name = f'"{schema_name}"."{table_name}__{postfix}"'
                logger.info(f'Creating materialized view {fq_view_name}')
                conn.execute(
                    f"""
                    create materialized view if not exists {fq_view_name} as (
                        select * from {fq_table_name}
                        where publication_date = '{date.strftime("%Y-%m-%d")}'
                    )
                    """
                )
            fq_view_name = f'"{schema_name}"."{table_name}__latest"'
            logger.info(f'Creating materialized view {fq_view_name}')
            conn.execute(
                f"""
                create materialized view if not exists {fq_view_name} as (
                    select * from {fq_table_name}
                    where publication_date = (
                        select max(publication_date) from {fq_table_name}
                    )
                );
                refresh materialized view {fq_view_name}
                """
            )
def cleanup_old_s3_files(*args, **kwargs):
    s3 = S3Hook("DEFAULT_S3")
    bucket = s3.get_bucket(config.S3_IMPORT_DATA_BUCKET)
    current_time = datetime.strptime(kwargs["ts_nodash"], "%Y%m%dT%H%M%S")
    logger.info(
        f"Retention period is {config.S3_RETENTION_PERIOD_DAYS} days before {current_time}"
    )
    pipelines = s3.list_prefixes(
        config.S3_IMPORT_DATA_BUCKET, prefix="import-data/", delimiter="/"
    )
    for pipeline in pipelines:
        run_ids = sorted(
            s3.list_prefixes(config.S3_IMPORT_DATA_BUCKET, prefix=pipeline, delimiter="/")
        )
        for run_id in run_ids:
            run_dt = datetime.strptime(run_id.split("/")[-2], "%Y%m%dT%H%M%S")
            if run_id == run_ids[-1]:
                logger.info(
                    f"Keeping {pipeline} run {run_id} ({run_dt}) - always retain the last run."
                )
            elif current_time - run_dt >= timedelta(
                days=config.S3_RETENTION_PERIOD_DAYS
            ):
                logger.info(
                    f"Deleting {pipeline} run {run_id} ({run_dt}) older than retention period"
                )
                bucket.objects.filter(Prefix=run_id).delete()
            else:
                logger.info(f"Keeping {pipeline} run {run_id} ({run_dt})")
def get_lines(files):
    for file, source_name in files:
        logger.info('Parsing file %s', file)
        for line in _without_first_and_last(file):
            data = line.strip().decode('utf-8', errors='replace').split("|")
            if min_fields <= len(data) < max_fields:
                yield data + [source_name]
            else:
                logger.warning(
                    "Ignoring row with %s fields instead of expected %s: %s",
                    len(data),
                    num_expected_fields,
                    line,
                )
def get_file_linked_from(url, filename):
    logger.info('Looking on %s for links to %s', url, filename)
    html = _download(url)
    soup = BeautifulSoup(html, "html.parser")
    links = [link.get('href') for link in soup.find_all('a') if link.get('href')]
    logger.info("Found links %s", links)

    matching_links = [link for link in links if link.endswith(filename)]
    if not matching_links:
        raise Exception(f'Unable to find link to {filename}')
    if len(matching_links) > 1:
        raise Exception(f'Too many links for {filename}')

    return _download(urljoin(url, matching_links[0]))
def report_metric_per_model(actual, predict, average_type='binary'):
    precisions = precision_score(actual, predict, average=average_type)
    recalls = recall_score(actual, predict, average=average_type)
    f1 = f1_score(actual, predict, average=average_type)
    accuracy = accuracy_score(actual, predict)
    auc = roc_auc_score(actual, predict)
    logger.info(f"Precision = {precisions}")
    logger.info(f"Recall = {recalls}")
    logger.info(f"f1 = {f1}")
    logger.info(f"Accuracy = {accuracy}")
    logger.info(f"AUC = {auc}")
    return precisions, recalls, f1, accuracy, auc
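# A quick usage sketch for report_metric_per_model. The labels below are made
# up; the function assumes these sklearn.metrics imports at module level.
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

y_true = [0, 1, 1, 0, 1]  # hypothetical ground-truth labels
y_pred = [0, 1, 0, 0, 1]  # hypothetical model predictions
precision, recall, f1, accuracy, auc = report_metric_per_model(y_true, y_pred)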
def create_csv(
    target_db: str,
    base_file_name: str,
    timestamp_output: bool,
    query: str,
    **kwargs,
):
    """
    Given a db, view name and a query, create a csv file and upload it to s3.
    """
    if timestamp_output:
        file_name = (
            f'{base_file_name}-{kwargs["next_execution_date"].strftime("%Y-%m-%d")}.csv'
        )
    else:
        file_name = f'{base_file_name}.csv'

    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
        echo=config.DEBUG,
    )

    row_count = 0
    run_date = kwargs.get('run_date', kwargs.get('execution_date'))
    with engine.begin() as conn:
        result = conn.execution_options(stream_results=True).execute(
            sa.text(query), run_date=run_date.date()
        )
        with tempfile.NamedTemporaryFile('w', encoding='utf8') as fh:
            writer = csv.writer(fh, quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow(result.keys())
            while True:
                chunk = result.fetchmany(1000)
                if not chunk:
                    break
                row_count += len(chunk)
                for row in chunk:
                    writer.writerow(row)
            fh.flush()
            logger.info(f'Wrote {row_count} rows to file {file_name}')

            s3_client = S3Hook('DATA_WORKSPACE_S3')
            s3_output_path = f's3://csv-pipelines/{base_file_name}/{file_name}'
            s3_client.load_file(
                fh.name,
                s3_output_path,
                bucket_name=config.DATA_WORKSPACE_S3_BUCKET,
                replace=True,
            )
            logger.info(f"Uploaded {file_name} to {s3_output_path}")
def drop_swap_tables(target_db: str, *tables, **kwargs):
    """Delete temporary swap dataset DB tables.

    Given a dataset table `table`, deletes any related swap tables
    containing the previous version of the dataset.
    """
    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
    )
    with engine.begin() as conn:
        conn.execute("SET statement_timeout = 600000")
        for table in tables:
            swap_table = get_temp_table(table, kwargs["ts_nodash"] + "_swap")
            logger.info("Removing %s", swap_table.name)
            swap_table.drop(conn, checkfirst=True)
def drop_temp_tables(target_db: str, *tables, **kwargs):
    """Delete temporary dataset DB tables.

    Given a dataset table `table`, deletes any related temporary tables
    created during the DAG run.
    """
    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
    )
    with engine.begin() as conn:
        conn.execute("SET statement_timeout = 600000")
        for table in tables:
            temp_table = get_temp_table(table, kwargs["ts_nodash"])
            logger.info("Removing %s", temp_table.name)
            temp_table.drop(conn, checkfirst=True)
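# Several helpers in this section call `get_temp_table`, which is defined
# elsewhere in the codebase. The sketch below is only a plausible minimal
# implementation inferred from how it is called here (a dataset table plus a
# per-run suffix such as `ts_nodash`); the exact naming convention and column
# copying are assumptions, not the real helper.
import sqlalchemy as sa


def get_temp_table_sketch(table: sa.Table, suffix: str) -> sa.Table:
    # Build a new Table object mirroring `table` with a suffixed name, so it
    # can be created, swapped, and dropped independently of the target table.
    return sa.Table(
        f"{table.name}_{suffix}".lower(),
        table.metadata,
        *(column.copy() for column in table.columns),
        schema=table.schema,
        keep_existing=True,  # tolerate repeated construction within one run
    )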
def _fetch(s3, trade_type, expected_keys):
    years = _get_years(trade_type)

    def paginate(items, num_per_page):
        page = []
        for item in items:
            page.append(item)
            if len(page) == num_per_page:
                yield page
                page = []
        if page:
            yield page

    def file_from_zip(zip_bytes):
        with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
            name = archive.namelist()[0]
            logger.info('Opening csv file %s in zip', name)
            with archive.open(name, "r") as file:
                yield from file

    def get_files(trade_type, expected_keys, periods):
        frequency = 'A'
        classification = 'HS' if trade_type == 'C' else 'EB02'
        for period in periods:
            yield file_from_zip(
                _download(
                    f'https://comtrade.un.org/api/get/bulk/{trade_type}/{frequency}/{period}/all/{classification}',
                    params=(('token', config.COMTRADE_TOKEN),),
                ).content
            )

    def get_dicts(files):
        for f in files:
            for row in csv.DictReader(codecs.iterdecode(f, 'utf-8-sig')):
                if list(row.keys()) != expected_keys:
                    raise Exception('Unexpected columns {}'.format(row.keys()))
                yield {k: v if v else None for k, v in row.items()}

    files = get_files(trade_type, expected_keys, years)
    result_records = get_dicts(files)
    results_pages = paginate(result_records, 10000)

    for i, page in enumerate(results_pages):
        output_filename = f"{i:010}.json"
        logger.info('Saving file to S3 %s', output_filename)
        s3.write_key(output_filename, page)
def _predict(X_to_predict, tokenizer, tags_to_predict, model_path):
    import tensorflow as tf

    logger.info("Start making prediction")
    ids = X_to_predict['id']
    X_to_predict = X_to_predict['sentence']
    text_to_predict = X_to_predict.copy()
    X_to_predict = transform_X(X_to_predict.values, tokenizer)
    Y_test_predict = np.zeros((X_to_predict.shape[0], len(tags_to_predict)))
    Y_test_predict_prob = np.zeros((X_to_predict.shape[0], len(tags_to_predict)))
    for ind, tag_i in enumerate(['_'.join(j.split(' ')) for j in tags_to_predict]):
        logger.info(f"Predicting for tag {ind}, {tag_i}")
        m = tf.keras.models.load_model(model_path + tag_i)
        test_predictions_prob_tag = m.predict(X_to_predict)
        test_predictions_class_tag = (test_predictions_prob_tag > probability_threshold) + 0
        Y_test_predict_prob[:, ind] = np.concatenate(test_predictions_prob_tag)
        Y_test_predict[:, ind] = np.concatenate(test_predictions_class_tag)

    predict = []
    predict_prob = []
    for i in np.arange(0, X_to_predict.shape[0]):
        predict.append(
            list(
                compress(
                    tags_to_predict, Y_test_predict_prob[i] > probability_threshold
                )
            )
        )
        predict_prob.append(dict(zip(tags_to_predict, Y_test_predict_prob[i])))

    prediction_on_data = pd.DataFrame({
        'id': ids,
        'sentence': text_to_predict,
        'prediction': predict,
        'prediction_prob': predict_prob,
    })
    return prediction_on_data
def make_prediction(target_db: str, query: str, table_name, **context):
    with TemporaryDirectory() as tempdir:
        os.chdir(tempdir)
        os.mkdir('the_models')
        os.chdir(os.path.join(tempdir, 'the_models'))
        logger.info(f"working dir: {os.getcwd()}")

        logger.info("step 1: fetch model")
        tags_to_predict = fetch_model()

        logger.info("step 2: fetch data")
        df = fetch_interaction_data(target_db, query)

        logger.info("step 3: make prediction")
        predictions = predict_tags(df, tags_to_predict)

        logger.info("step 4: write prediction to S3")
        write_prediction(table_name, predictions, context)
def cleanup_old_datasets_db_tables(*args, **kwargs):
    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=config.DATASETS_DB_NAME).get_conn,
    )
    current_time = datetime.strptime(kwargs["ts_nodash"], "%Y%m%dT%H%M%S")
    logger.info(
        f"Retention period is {config.DB_TEMP_TABLE_RETENTION_PERIOD_DAYS} days before {current_time}"
    )
    with engine.begin() as conn:
        tables = [
            table
            for table in conn.execute(
                '''
                SELECT schemaname, tablename
                FROM pg_catalog.pg_tables
                WHERE schemaname NOT IN ('dataflow', 'information_schema')
                AND schemaname NOT LIKE '\\_%%'
                AND schemaname NOT LIKE 'pg_%%'
                '''
            )
        ]
        for table in tables:
            schema, table_name = table
            table_match = re.match(r"(.*)_(\d{8}t\d{6})(?:_swap)?", table_name)
            if not table_match:
                logger.info(f"Skipping {schema}.{table_name}")
                continue
            table_dt = datetime.strptime(table_match.groups()[1], "%Y%m%dt%H%M%S")
            if current_time - table_dt >= timedelta(
                days=config.DB_TEMP_TABLE_RETENTION_PERIOD_DAYS
            ):
                if table_match.groups()[0] not in [table[1] for table in tables]:
                    logger.warning(
                        f"Main table {table_match.groups()[0]} missing for {schema}.{table_name}, skipping"
                    )
                else:
                    logger.info(
                        f"Deleting temporary table {schema}.{table_name} ({table_dt}) older than retention period"
                    )
                    conn.execute(
                        "DROP TABLE {}.{}".format(
                            engine.dialect.identifier_preparer.quote(schema),
                            engine.dialect.identifier_preparer.quote(table_name),
                        )
                    )
            else:
                logger.info(f"Keeping table {schema}.{table_name}")
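# The regex above encodes the temp-table naming convention: a base table name,
# an underscore, a lower-cased `ts_nodash` timestamp, and an optional `_swap`
# suffix. A quick illustration with made-up table names:
import re

_pattern = re.compile(r"(.*)_(\d{8}t\d{6})(?:_swap)?")

for _name in (
    "my_dataset_20200101t000000",
    "my_dataset_20200101t000000_swap",
    "my_dataset",
):
    _match = _pattern.match(_name)
    print(_name, "->", _match.groups() if _match else "no match")
# my_dataset_20200101t000000      -> ('my_dataset', '20200101t000000')
# my_dataset_20200101t000000_swap -> ('my_dataset', '20200101t000000')
# my_dataset                      -> no match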
def update_table(target_db: str, target_table: sa.Table, update_query: str, **kwargs):
    """
    Run a query to update an existing table from a temporary table.
    """
    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
    )
    with engine.begin() as conn:
        from_table = get_temp_table(target_table, kwargs["ts_nodash"])
        logger.info(f'Updating {target_table.name} from {from_table.name}')
        conn.execute(
            update_query.format(
                schema=engine.dialect.identifier_preparer.quote(target_table.schema),
                target_table=engine.dialect.identifier_preparer.quote(target_table.name),
                from_table=engine.dialect.identifier_preparer.quote(from_table.name),
            )
        )
def fetch_companies_house_companies(
    table_name: str,
    source_url: str,
    number_of_files: int,
    page_size: int = 10000,
    **kwargs,
):
    """
    Loop through `number_of_files`, build the url, download the zip file,
    extract and write data in batches of `page_size` to s3.
    """
    s3 = S3Data(table_name, kwargs['ts_nodash'])
    page = 1
    results = []
    publish_date = datetime(
        kwargs['next_execution_date'].year, kwargs['next_execution_date'].month, 1
    ).strftime('%Y-%m-01')
    for file_num in range(1, number_of_files + 1):
        url = source_url.format(
            file_date=publish_date,
            file_num=file_num,
            num_files=number_of_files,
        )
        logger.info('Fetching zip file from %s', url)
        with zipfile.ZipFile(io.BytesIO(_download(url))) as archive:
            with archive.open(archive.namelist()[0], 'r') as f:
                reader = csv.DictReader(codecs.iterdecode(f, 'utf-8'))
                if reader.fieldnames is not None:
                    reader.fieldnames = [x.strip() for x in reader.fieldnames]
                for row in reader:
                    row['publish_date'] = publish_date
                    results.append(row)
                    if len(results) >= page_size:
                        s3.write_key(f'{page:010}.json', results)
                        results = []
                        page += 1
    if results:
        s3.write_key(f'{page:010}.json', results)
    logger.info('Fetching from source completed')
def fetch_from_gtr_api(table_name: str, resource_type: str, **kwargs):
    source_url = 'https://gtr.ukri.org/gtr/api'
    s3 = S3Data(table_name, kwargs["ts_nodash"])
    page = 1
    while True:
        response = requests.get(
            f'{source_url}/{resource_type}s',
            params={'p': page, 's': 100},
            headers={'Accept': 'application/json'},
        )
        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError:
            logger.error(f"Request failed: {response.text}")
            raise
        response_json = response.json()
        total_pages = response_json['totalPages']
        total_number_of_results = response_json['totalSize']
        results = response_json[resource_type]
        s3.write_key(f"{page:010}.json", results)
        logger.info(
            f"Fetched {len(results) * page} out of {total_number_of_results} "
            f"{resource_type} records"
        )
        page += 1
        if page > total_pages:
            break
    logger.info("Fetching from source completed")
def branch_on_modified_date(target_db: str, table_config: TableConfig, **context):
    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
    )
    with engine.begin() as conn:
        res = conn.execute(
            """
            SELECT source_data_modified_utc
            FROM dataflow.metadata
            WHERE table_schema = %s and table_name = %s
            """,
            [table_config.schema, table_config.table_name],
        ).fetchall()
        if len(res) == 0:
            return 'continue'
        if len(res) > 1:
            raise AirflowException(
                f"Multiple rows in the dataflow metadata table for "
                f"{table_config.schema}.{table_config.table_name}"
            )
        if not res[0][0]:
            return 'continue'
        old_modified_utc = res[0][0]
        new_modified_utc = context['task_instance'].xcom_pull(
            task_ids='get-source-modified-date'
        )
        context['task_instance'].xcom_push('source-modified-date-utc', new_modified_utc)
        logger.info("Old: %s. New: %s", old_modified_utc, new_modified_utc)
        if new_modified_utc > old_modified_utc:
            return 'continue'
        return 'stop'
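# branch_on_modified_date returns a task id, so it is intended for Airflow's
# BranchPythonOperator. The wiring below is hypothetical: the 'continue' and
# 'stop' task ids come from the return values above, an upstream task with id
# 'get-source-modified-date' is assumed to have pushed the new timestamp, and
# the conn id and config in op_args are placeholders. Runs inside a DAG
# context.
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator

branch = BranchPythonOperator(
    task_id='branch-on-modified-date',
    python_callable=branch_on_modified_date,
    provide_context=True,
    op_args=['datasets_db', table_config],  # assumed conn id and table config
)
branch >> [DummyOperator(task_id='continue'), DummyOperator(task_id='stop')]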
def nested_files_from_zip(zip_bytes):
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
        for name in archive.namelist():
            if not name.lower().startswith(base_filename):
                # Some sesx16 files seem to contain unrelated data
                logger.info('Skipping file %s', name)
                continue
            logger.info('Opening file in zip %s', name)
            with archive.open(name, "r") as file:
                with zipfile.ZipFile(file) as inner_archive:
                    inner_name = inner_archive.namelist()[0]
                    logger.info('Opening inner file in zip %s', inner_name)
                    with inner_archive.open(inner_name, "r") as inner_file:
                        logger.info('Opened inner file in zip %s', inner_name)
                        yield inner_file, inner_name
def scrape_load_and_check_data(
    target_db: str,
    table_config: TableConfig,
    pipeline_instance: "_PandasPipelineWithPollingSupport",
    **kwargs,
):
    create_temp_tables(target_db, *table_config.tables, **kwargs)
    temp_table = get_temp_table(table_config.table, suffix=kwargs['ts_nodash'])
    data_frames = pipeline_instance.__class__.data_getter()

    parsed_uri = urlparse(os.environ['AIRFLOW_CONN_DATASETS_DB'])
    host, port, dbname, user, password = (
        parsed_uri.hostname,
        parsed_uri.port or 5432,
        parsed_uri.path.strip('/'),
        parsed_uri.username,
        parsed_uri.password,
    )

    # Psycopg3 is still under active development, but crucially has support for
    # generating data and pushing it to postgres efficiently via `cursor.copy`
    # and the COPY protocol.
    with psycopg3.connect(
        f'host={host} port={port} dbname={dbname} user={user} password={password}'
    ) as connection:
        with connection.cursor() as cursor:
            logger.info("Starting streaming copy to DB")
            records_num = 0
            df_num = 0
            with cursor.copy(
                f'COPY "{temp_table.schema}"."{temp_table.name}" FROM STDIN'
            ) as copy:
                for data_frame in data_frames:
                    df_num += 1
                    df_len = len(data_frame)
                    records_num += df_len
                    logger.info(
                        "Copying data frame #%s (records %s - %s)",
                        df_num,
                        records_num - df_len,
                        records_num,
                    )
                    copy.write(
                        data_frame.to_csv(
                            index=False,
                            header=False,
                            sep='\t',
                            na_rep=r'\N',
                            columns=[
                                data_column
                                for data_column, sa_column in table_config.columns
                            ],
                        )
                    )
                    del data_frame
    logger.info("Copy complete.")
def create_temp_tables(target_db: str, *tables: sa.Table, **kwargs):
    """
    Create a temporary table for the current DAG run for each of the given
    dataset tables.

    Table names are unique for each DAG run, using the target table name as a
    prefix and the current DAG execution timestamp as a suffix.
    """
    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
    )
    with engine.begin() as conn:
        conn.execute("SET statement_timeout = 600000")
        for table in tables:
            table = get_temp_table(table, kwargs["ts_nodash"])
            logger.info("Creating schema %s if not exists", table.schema)
            conn.execute(f"CREATE SCHEMA IF NOT EXISTS {table.schema}")
            logger.info("Creating %s", table.name)
            table.create(conn, checkfirst=True)
def fetch_from_hawk_api(
    table_name: str,
    source_url: str,
    hawk_credentials: dict,
    results_key: str = "results",
    next_key: Optional[str] = "next",
    validate_response: Optional[bool] = True,
    force_http: Optional[bool] = False,
    **kwargs,
):
    s3 = S3Data(table_name, kwargs["ts_nodash"])
    total_records = 0
    page = 1
    while True:
        data = _hawk_api_request(
            source_url,
            credentials=hawk_credentials,
            results_key=results_key,
            next_key=next_key,
            validate_response=validate_response,
            force_http=force_http,
        )

        results = get_nested_key(data, results_key)
        s3.write_key(f"{page:010}.json", results)

        total_records += len(results)
        logger.info(f"Fetched {total_records} records")

        source_url = get_nested_key(data, next_key) if next_key else None
        if not source_url:
            break
        page += 1

    logger.info("Fetching from source completed")
def fetch_mapped_hosted_csvs(
    table_name: str,
    source_urls: Dict[str, str],
    df_transform: Callable[[pd.DataFrame], pd.DataFrame],
    page_size: int = 10000,
    **kwargs,
):
    s3 = S3Data(table_name, kwargs["ts_nodash"])

    page = 1
    for type_, source_url in source_urls.items():
        logger.info(f"Fetching {source_url}")
        response = requests.get(source_url)
        df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
        df = df_transform(df)
        df["source_url_key"] = type_

        for i in range((len(df) // page_size) + 1):
            results = df.iloc[page_size * i : page_size * (i + 1)].to_json(
                orient="records", date_format="iso"
            )
            s3.write_key(f"{page:010}.json", results, jsonify=False)
            page += 1

    logger.info("Fetching from source completed")