    def transform(self, s3_source, s3_dest, job_id):
        objects = s3.get_keys_with_prefix(s3_source)
        if len(objects) == 0:
            message = f'Zero objects found in {s3_source}. Quitting.'
            app.logger.info(message)
            return message
        app.logger.info(f'Will transform {len(objects)} objects from {s3_source} and put results to {s3_dest}.')
        objects_updated = 0
        new_objects = 0
        objects_in_error = 0
        total_objects = 0
        for o in objects:
            file_name = o.split('/')[-1]
            app.logger.debug(f'processing {file_name}')
            # file_name is like 'daily_2020-08-14.zip'
            piazza_zip_file = s3.get_object_compressed_text_reader(o)
            for subfile in piazza_zip_file.namelist():
                if '.json' in subfile:
                    try:
                        json_file = subfile.split('/')[-1]
                        course_id = subfile.split('/')[-2]
                        file_type = json_file.split('_')[0]
                        record = piazza_zip_file.read(subfile)
                        with tempfile.TemporaryFile() as result:
                            s3_object = f'{s3_dest}/{file_type}/{course_id}/{json_file}'
                            if s3.object_exists(s3_object):
                                objects_updated += 1
                            else:
                                new_objects += 1
                            result.write(record)
                            s3.upload_file(result, s3_object)
                            total_objects += 1
                        # update job queue every 1000 files...
                        if total_objects % 1000 == 0:
                            message = f'{subfile}, {total_objects} so far; ' \
                                      + f'{new_objects} new files; ' \
                                      + f'{objects_updated} existing files. {objects_in_error} files in error ' \
                                      + f'({len(objects)} objects in all)'
                            update_background_job_status(job_id, 'transforming', details=message)
                    except Exception as e:
                        app.logger.error(f'could not extract {subfile}')
                        app.logger.error(e)
                        objects_in_error += 1
                else:
                    # not a json file, so we skip it
                    continue
        message = f'Transformed {len(objects)} input files; created {new_objects} new objects; ' \
                  + f'updated {objects_updated} existing objects. {objects_in_error} objects in error'
        app.logger.info(message)
        return message
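For reference, a minimal standalone sketch of the destination-key derivation in the loop above, run against a hypothetical zip entry path and destination prefix (the real entries come from the Piazza daily export):

# Hedged sketch: the entry path and destination prefix below are made-up examples.
subfile = 'daily/123456/users_2020-08-14.json'
s3_dest = 'piazza-data/transformed'
json_file = subfile.split('/')[-1]     # 'users_2020-08-14.json'
course_id = subfile.split('/')[-2]     # '123456'
file_type = json_file.split('_')[0]    # 'users'
print(f'{s3_dest}/{file_type}/{course_id}/{json_file}')
# piazza-data/transformed/users/123456/users_2020-08-14.json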
Example No. 2
    def _upload_file_to_staging(self, table, _file):
        tsv_filename = f'staging_{table}.tsv'
        s3_key = f'{get_s3_edl_daily_path()}/{tsv_filename}'

        app.logger.info(f'Will stash {table} feeds in S3: {s3_key}')
        if not s3.upload_file(_file, s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')

        app.logger.info('Will copy S3 feeds into Redshift...')
        if not redshift.copy_tsv_from_s3(f'{self.internal_schema}.{table}', s3_key):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')
Example No. 3
def _upload_file_to_staging(table, _file):
    tsv_filename = f'staging_{table}.tsv'
    s3_key = f'{get_s3_edl_daily_path()}/{tsv_filename}'

    app.logger.info(f'Will stash {table} feeds in S3: {s3_key}')
    if not s3.upload_file(_file, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')

    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.copy_tsv_from_s3(
            f"{app.config['REDSHIFT_SCHEMA_EDL']}.{table}", s3_key):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
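A hedged usage sketch for the helper above, assuming a hypothetical 'sis_enrollments' table and a TSV feed written to a temporary file (the same pattern the surrounding examples use when handing files to s3.upload_file):

import tempfile

# Hypothetical caller: stage a small TSV feed; the row contents are made up.
with tempfile.TemporaryFile() as tsv_file:
    tsv_file.write(b'12345\t2228\tE\n')
    _upload_file_to_staging('sis_enrollments', tsv_file)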
Example No. 4
    def transform(self, s3_source, s3_dest, key=None):
        objects = s3.get_keys_with_prefix(s3_source)
        app.logger.info(
            f'Will transform {len(objects)} objects from {s3_source} and put results to {s3_dest}.'
        )
        skip_count = 0
        for o in objects:
            file_name = o.split('/')[-1]
            if s3.object_exists(f'{s3_dest}/{file_name}'):
                skip_count += 1
                continue
            canvas_api_data = s3.get_object_json(o).get(key) if key else s3.get_object_json(o)
            with tempfile.TemporaryFile() as result:
                course_id = int(file_name.split('_')[-2])
                for record in canvas_api_data:
                    record['course_id'] = course_id
                    result.write(json.dumps(record).encode() + b'\n')
                s3.upload_file(result, f'{s3_dest}/{file_name}')
        app.logger.info(
            f'Transformed {len(objects) - skip_count} new objects; skipped {skip_count} existing objects.'
        )
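A minimal in-memory sketch of the per-record rewrite above, using hypothetical Canvas API records and a hypothetical source object name (course_id comes from the second-to-last underscore-separated token of the file name):

import json

file_name = 'gradebook_history_7654321_feed.json'   # hypothetical source object name
course_id = int(file_name.split('_')[-2])            # 7654321
canvas_api_data = [{'id': 1}, {'id': 2}]              # hypothetical API records
payload = b''
for record in canvas_api_data:
    record['course_id'] = course_id
    payload += json.dumps(record).encode() + b'\n'
print(payload.decode())
# {"id": 1, "course_id": 7654321}
# {"id": 2, "course_id": 7654321}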
Example No. 5
def upload_file_to_staging(table, term_file, row_count, term_id):
    if term_id:
        tsv_filename = f'staging_{table}_{term_id}.tsv'
    else:
        tsv_filename = f'staging_{table}.tsv'
    s3_key = f'{get_s3_sis_api_daily_path()}/{tsv_filename}'
    app.logger.info(f'Will stash {row_count} feeds in S3: {s3_key}')
    if not s3.upload_file(term_file, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')

    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        COPY {staging_schema}.{table}
            FROM '{loch_s3_sis_api_data_path}/{tsv_filename}'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
        staging_schema=staging_schema(),
        table=table,
        tsv_filename=tsv_filename,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
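For illustration only, roughly what the resolved COPY statement ends up looking like, with hypothetical values standing in for the config-driven placeholders ({staging_schema}, {loch_s3_sis_api_data_path}, {redshift_iam_role}):

# Hedged illustration: the schema, bucket path, and IAM role below are all made up.
rendered_copy = """
    COPY sis_api_staging.sis_enrollments
        FROM 's3://example-loch/sis-api/daily/2024-01-15/staging_sis_enrollments_2228.tsv'
        IAM_ROLE 'arn:aws:iam::123456789012:role/example-redshift-copy'
        DELIMITER '\\t';
"""
print(rendered_copy)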