# Standard-library imports used by the snippets below; project helpers such as
# LOG, S3Helper, EC2Helper, the get_* functions, USAGE, INDIVIDUAL, connection
# and COBBLESTONE_FACTOR are assumed to be provided by the surrounding modules.
import argparse
import glob
import gzip
import os
import shutil
import sys
import time
import xml.etree.ElementTree as ET
from datetime import date


def move_files_to_s3(s3helper, directory_name):
    """
    Copy every file in directory_name to the S3 archive, then delete the directory
    """
    # The directory path only needs splitting once
    (root_directory_name, tail_directory_name) = os.path.split(directory_name)
    for file_name in glob.glob(os.path.join(directory_name, '*')):
        (root_file_name, tail_file_name) = os.path.split(file_name)
        key = get_stats_archive_key(correct(tail_directory_name), tail_file_name)
        LOG.info('Adding {0} to {1}'.format(file_name, key))
        s3helper.add_file_to_bucket(get_archive_bucket(), key, file_name)

    shutil.rmtree(directory_name, ignore_errors=True)
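# Usage sketch for move_files_to_s3 (an assumption, not part of the original
# module): archive a day of locally generated stats and remove the local copy.
# The directory path here is hypothetical.
s3helper = S3Helper()
move_files_to_s3(s3helper, '/tmp/stats/stats_2014_01_31')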
def access_s3():
    """
    Check we can access the archive bucket

    :return:
    """
    try:
        LOG.info('Testing S3 access')
        s3helper = S3Helper()
        bucket = s3helper.get_bucket(get_archive_bucket())
        LOG.info('Access S3 bucket name: {0}'.format(bucket.name))
    except Exception:
        LOG.exception('access_s3')
        return False

    return True
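# Sketch of how access_s3 plugs into the sanity checking used by the AMI entry
# points below. pass_sanity_checks is called in the original code; the body
# shown here is an assumption, based on access_s3 being the only check shown.
def pass_sanity_checks():
    """
    Run the sanity checks before the AMI does any real work
    """
    return access_s3()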
# The parser construction is assumed; the option mirrors the matching entry
# point below
parser = argparse.ArgumentParser()
parser.add_argument('option', choices=['boinc', 'ami'], help='are we running on the BOINC server or the AMI server')
args = vars(parser.parse_args())

if args['option'] == 'boinc':
    LOG.info('PYTHONPATH = {0}'.format(sys.path))
    # We're running from the BOINC server
    process_boinc()
else:
    # We're running from a specially created AMI
    filename, full_filename = get_ami_log_file('archive_boinc_stats')
    add_file_handler_to_root(full_filename)

    LOG.info('PYTHONPATH = {0}'.format(sys.path))
    LOG.info('About to perform sanity checks')
    if pass_sanity_checks():
        process_ami()
    else:
        LOG.error('Failed to pass sanity tests')

    # Try copying the log file to S3
    try:
        LOG.info('About to copy the log file')
        s3helper = S3Helper()
        s3helper.add_file_to_bucket(get_archive_bucket(),
                                    get_log_archive_key('archive_boinc_stats', filename),
                                    full_filename,
                                    True)
        os.remove(full_filename)
    except Exception:
        LOG.exception('Failed to copy the log file')

    ec2_helper = EC2Helper()
    ec2_helper.release_public_ip()

LOG.info('All done')
# The parser construction is assumed from the add_argument call that follows
parser = argparse.ArgumentParser()
parser.add_argument('option', choices=['boinc', 'ami'], help='are we running on the BOINC server or the AMI server')
args = vars(parser.parse_args())

if args['option'] == 'boinc':
    LOG.info('PYTHONPATH = {0}'.format(sys.path))
    # We're running from the BOINC server
    original_image_checked_boinc()
else:
    # We're running from a specially created AMI
    log_name = 'original_image_checked'
    filename, full_filename = get_ami_log_file(log_name)
    add_file_handler_to_root(full_filename)

    LOG.info('PYTHONPATH = {0}'.format(sys.path))
    LOG.info('About to perform sanity checks')
    if pass_sanity_checks():
        original_image_checked_ami()
    else:
        LOG.error('Failed to pass sanity tests')

    # Try copying the log file to S3
    try:
        LOG.info('About to copy the log file')
        s3helper = S3Helper()
        s3helper.add_file_to_bucket(get_archive_bucket(),
                                    get_log_archive_key(log_name, filename),
                                    full_filename,
                                    True)
        os.remove(full_filename)
    except Exception:
        LOG.exception('Failed to copy the log file')

LOG.info('All done')
def get_data(output_directory):
    """
    Get the stats from the S3 archive and build the csv files

    :param output_directory: where to store the files
    :return:
    """
    done_dates = get_done_dates()

    # Now get ready to load the files
    keys_being_restored = []
    s3helper = S3Helper()
    bucket = s3helper.get_bucket(get_archive_bucket())
    set_filenames = set()
    for prefix in bucket.list(prefix='stats/', delimiter='/'):
        # Prefix names look like 'stats/stats_YYYY_MM_DD/'
        elements = prefix.name.split('/')
        elements = elements[1].split('_')
        date_file = date(int(elements[1]), int(elements[2]), int(elements[3]))

        if date_file not in done_dates:
            stats_file = '{0}_{1}_{2}_user.gz'.format(elements[1], elements[2], elements[3])
            full_filename = os.path.join(output_directory, stats_file)

            if full_filename in set_filenames:
                # Ignore - we've already queued this file
                pass
            elif not os.path.exists(full_filename) or os.path.getsize(full_filename) == 9:
                # The file is missing, or only holds the 9-byte 'Restoring' place holder
                set_filenames.add(full_filename)
                key = bucket.get_key(os.path.join(prefix.name, 'user.gz'))
                if key is not None:
                    if key.ongoing_restore or key.storage_class == 'GLACIER':
                        LOG.info('Restoring {0}'.format(key.name))
                        # We need to retrieve it from Glacier
                        if not key.ongoing_restore:
                            key.restore(days=5)
                        keys_being_restored.append([key.name, full_filename])

                        # Put a place holder file in the directory
                        if not os.path.exists(full_filename):
                            output_file = open(full_filename, 'wb')
                            output_file.write('Restoring')
                            output_file.close()
                    else:
                        # Put the file in the storage area
                        LOG.info('Fetching {0}'.format(key.name))
                        key.get_contents_to_filename(full_filename)

    # Now we have to wait for all the files we need to be restored; keep
    # polling each key until its restore has finished, then fetch it
    for key_pair in keys_being_restored:
        key = bucket.get_key(key_pair[0])
        while key.ongoing_restore:
            time.sleep(300)
            key = bucket.get_key(key_pair[0])

        # The file has been restored so copy it
        LOG.info('Fetching {0}'.format(key_pair[0]))
        key.get_contents_to_filename(key_pair[1])

    # Build the prepared statements
    insert_usage = USAGE.insert()
    insert_individual = INDIVIDUAL.insert()

    # Now build up the list of filenames
    for file_name in glob.glob(os.path.join(output_directory, '*_user.gz')):
        (head, tail) = os.path.split(file_name)
        elements = tail.split('_')
        date_file = date(int(elements[0]), int(elements[1]), int(elements[2]))

        if date_file not in done_dates:
            # Read the contents
            LOG.info('Processing {0}'.format(file_name))
            gzip_file = gzip.open(file_name, 'rb')
            contents = gzip_file.read()
            gzip_file.close()

            # Extract the XML data
            root = ET.fromstring(contents)

            # Initialise
            gflops = 0.0
            active_users = 0
            registered_users = 0

            transaction = connection.begin()
            # The users are in a random order
            for user in root:
                user_id = int(user.find('id').text)
                expavg_credit = float(user.find('expavg_credit').text)
                connection.execute(insert_individual,
                                   date=date_file,
                                   user_id=user_id,
                                   expavg_credit=expavg_credit)
                registered_users += 1

                if expavg_credit > 1:
                    active_users += 1
                gflops += expavg_credit

            connection.execute(insert_usage,
                               date=date_file,
                               gflops=gflops / COBBLESTONE_FACTOR,
                               active_users=active_users,
                               registered_users=registered_users)
            transaction.commit()
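# Hypothetical driver for get_data (not part of the original module): build
# the statistics in a scratch directory and discard it afterwards. The use of
# tempfile here is an assumption.
import tempfile

output_directory = tempfile.mkdtemp()
try:
    get_data(output_directory)
finally:
    shutil.rmtree(output_directory, ignore_errors=True)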