def transform(self, s3_source, s3_dest, job_id):
    objects = s3.get_keys_with_prefix(s3_source)
    if len(objects) == 0:
        message = f'Zero objects found in {s3_source}. Quitting.'
        app.logger.info(message)
        return message
    app.logger.info(f'Will transform {len(objects)} objects from {s3_source} and put results to {s3_dest}.')
    objects_updated = 0
    new_objects = 0
    objects_in_error = 0
    total_objects = 0
    for o in objects:
        file_name = o.split('/')[-1]
        app.logger.debug(f'processing {file_name}')
        # file_name is like 'daily_2020-08-14.zip'
        piazza_zip_file = s3.get_object_compressed_text_reader(o)
        for subfile in piazza_zip_file.namelist():
            if '.json' in subfile:
                try:
                    # subfile paths are expected to look like '<...>/<course_id>/<file_type>_<...>.json'
                    json_file = subfile.split('/')[-1]
                    course_id = subfile.split('/')[-2]
                    file_type = json_file.split('_')[0]
                    record = piazza_zip_file.read(subfile)
                    with tempfile.TemporaryFile() as result:
                        s3_object = f'{s3_dest}/{file_type}/{course_id}/{json_file}'
                        if s3.object_exists(s3_object):
                            objects_updated += 1
                        else:
                            new_objects += 1
                        result.write(record)
                        s3.upload_file(result, s3_object)
                    total_objects += 1
                    # update job queue every 1000 files
                    if total_objects % 1000 == 0:
                        message = f'{subfile}, {total_objects} so far; ' \
                            + f'{new_objects} new files; ' \
                            + f'{objects_updated} existing files. {objects_in_error} files in error ' \
                            + f'({len(objects)} objects in all)'
                        update_background_job_status(job_id, 'transforming', details=message)
                except Exception as e:
                    app.logger.error(f'could not extract {subfile}')
                    app.logger.error(e)
                    objects_in_error += 1
            else:
                # not a JSON file, so skip it
                continue
    message = f'Transformed {len(objects)} input files; created {new_objects} new objects; ' \
        + f'updated {objects_updated} existing objects. {objects_in_error} objects in error'
    app.logger.info(message)
    return message
def _upload_file_to_staging(self, table, _file):
    tsv_filename = f'staging_{table}.tsv'
    s3_key = f'{get_s3_edl_daily_path()}/{tsv_filename}'
    app.logger.info(f'Will stash {table} feeds in S3: {s3_key}')
    if not s3.upload_file(_file, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.copy_tsv_from_s3(f'{self.internal_schema}.{table}', s3_key):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
def _upload_file_to_staging(table, _file):
    tsv_filename = f'staging_{table}.tsv'
    s3_key = f'{get_s3_edl_daily_path()}/{tsv_filename}'
    app.logger.info(f'Will stash {table} feeds in S3: {s3_key}')
    if not s3.upload_file(_file, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.copy_tsv_from_s3(f"{app.config['REDSHIFT_SCHEMA_EDL']}.{table}", s3_key):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
def transform(self, s3_source, s3_dest, key=None):
    objects = s3.get_keys_with_prefix(s3_source)
    app.logger.info(f'Will transform {len(objects)} objects from {s3_source} and put results to {s3_dest}.')
    skip_count = 0
    for o in objects:
        file_name = o.split('/')[-1]
        if s3.object_exists(f'{s3_dest}/{file_name}'):
            skip_count += 1
            continue
        canvas_api_data = s3.get_object_json(o).get(key) if key else s3.get_object_json(o)
        with tempfile.TemporaryFile() as result:
            # the course id is expected to be the second-to-last underscore-delimited token in the file name
            course_id = int(file_name.split('_')[-2])
            for record in canvas_api_data:
                record['course_id'] = course_id
                result.write(json.dumps(record).encode() + b'\n')
            s3.upload_file(result, f'{s3_dest}/{file_name}')
    app.logger.info(f'Transformed {len(objects) - skip_count} new objects; skipped {skip_count} existing objects.')
def upload_file_to_staging(table, term_file, row_count, term_id):
    if term_id:
        tsv_filename = f'staging_{table}_{term_id}.tsv'
    else:
        tsv_filename = f'staging_{table}.tsv'
    s3_key = f'{get_s3_sis_api_daily_path()}/{tsv_filename}'
    app.logger.info(f'Will stash {row_count} feeds in S3: {s3_key}')
    if not s3.upload_file(term_file, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {staging_schema}.{table}
        FROM '{loch_s3_sis_api_data_path}/{tsv_filename}'
        IAM_ROLE '{redshift_iam_role}'
        DELIMITER '\\t';
        """,
        staging_schema=staging_schema(),
        table=table,
        tsv_filename=tsv_filename,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
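# For illustration only: a minimal sketch of roughly what the COPY statement above looks like
# once resolve_sql_template_string has substituted its configuration values. Every value below
# (schema, S3 path, IAM role, table, filename) is a made-up placeholder, not real configuration.
def _example_resolved_copy_statement():
    staging_schema_name = 'sis_api_staging'                                  # placeholder for staging_schema()
    s3_path = 's3://example-loch-bucket/sis_api/daily/2020-08-14'            # placeholder for {loch_s3_sis_api_data_path}
    iam_role = 'arn:aws:iam::123456789012:role/example-redshift-copy-role'   # placeholder for {redshift_iam_role}
    tsv_filename = 'staging_enrollments_2232.tsv'                            # placeholder staging file name
    return f"""
        COPY {staging_schema_name}.enrollments
        FROM '{s3_path}/{tsv_filename}'
        IAM_ROLE '{iam_role}'
        DELIMITER '\\t';
    """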