async def csv_to_records(loop, stdin, skip_incomplete=True):
    """
    :rtype: async generator of records (dict-like)
    """
    async def inventory_by_folder(stdin):
        previous = None
        result = []
        async for entry in read_csv(stdin):
            object_key = entry['Key']
            folder = os.path.dirname(object_key)

            if previous is None:
                previous = folder

            if previous == folder:
                result.append(entry)
            else:
                yield result
                previous = folder
                result = [entry]
        if result:
            yield result

    def deduplicate_entries(entries):
        # Windows releases are published as both .zip and .exe files.
        # Deduplicate these (keep .exe only).
        # Some old Linux versions (1.5b2) were published with installer.tar.gz.
        filtered = [e for e in entries if e['Key'].endswith('.exe')]
        if len(filtered) > 0:
            entries = filtered
        longer_first = sorted(entries, key=lambda e: len(e['Key']),
                              reverse=True)
        deduplicate = {
            e['Key'].lower()
                    .replace('+setup+', '-')
                    .replace('.installer.exe', '')
                    .replace('.exe', '')
                    .replace('.installer.tar.gz', '')
                    .replace('.tar.gz', '')
                    .replace('.zip', ''): e
            for e in longer_first
        }
        return deduplicate.values()

    # Read metadata of previous run, and warm up cache.
    # Will save a lot of hits to archive.mozilla.org.
    metadata_cache_file = os.path.join(CACHE_FOLDER,
                                       '.metadata-{}.json'.format(__version__))
    if os.path.exists(metadata_cache_file):
        metadata = json.load(open(metadata_cache_file))
        _rc_metadata.update(metadata['rc'])
        _release_metadata.update(metadata['release'])
        _nightly_metadata.update(metadata['nightly'])

    async with aiohttp.ClientSession(loop=loop) as session:
        batch = []

        async for entries in inventory_by_folder(stdin):
            entries = deduplicate_entries(entries)

            for entry in entries:
                object_key = entry['Key']

                try:
                    # /pub/thunderbird/nightly/...
                    product = object_key.split('/')[1]
                except IndexError:
                    continue  # e.g. https://archive.mozilla.org/favicon.ico
                if product not in PRODUCTS:
                    continue

                # Scan the list of candidates metadata (no-op if already
                # initialized).
                await scan_candidates(session, product)

                url = ARCHIVE_URL + object_key.replace('+', ' ')
                if not is_build_url(product, url):
                    continue
                try:
                    record = record_from_url(url)
                except Exception as e:
                    logger.exception(e)
                    continue

                # Complete with info that can't be obtained from the URL.
                filesize = int(float(entry['Size']))  # e.g. 2E+10
                lastmodified = datetime.datetime.strptime(
                    entry['LastModifiedDate'], '%Y-%m-%dT%H:%M:%S.%fZ')
                lastmodified = lastmodified.strftime(DATETIME_FORMAT)
                record['download']['size'] = filesize
                record['download']['date'] = lastmodified

                # Append first, then flush when the batch is full, so the
                # current record is never dropped.
                batch.append(record)
                if len(batch) >= NB_PARALLEL_REQUESTS:
                    async for result in process_batch(session, batch,
                                                      skip_incomplete):
                        yield result
                    batch = []  # Go on.

        # Last loop iteration.
        async for result in process_batch(session, batch, skip_incomplete):
            yield result

        # Save accumulated metadata for next runs.
        tmpfilename = metadata_cache_file + '.tmp'
        metadata = {
            'rc': _rc_metadata,
            'release': _release_metadata,
            'nightly': _nightly_metadata,
        }
        json.dump(metadata, open(tmpfilename, 'w'))
        os.rename(tmpfilename, metadata_cache_file)
async def main(loop, event):
    """
    Trigger when an S3 event kicks in.

    http://docs.aws.amazon.com/AmazonS3/latest/dev/notification-content-structure.html
    """
    server_url = config('SERVER_URL', default='http://localhost:8888/v1')
    bucket = config('BUCKET', default='build-hub')
    collection = config('COLLECTION', default='releases')
    kinto_auth = tuple(config('AUTH', 'user:pass').split(':'))

    kinto_client = kinto_http.Client(server_url=server_url, auth=kinto_auth,
                                     retry=NB_RETRY_REQUEST)

    records = []
    for record in event['Records']:
        if record.get('EventSource') == 'aws:sns':
            records.extend(json.loads(record['Sns']['Message'])['Records'])
        else:
            records.append(record)

    async with aiohttp.ClientSession(loop=loop) as session:
        for event_record in records:
            metrics.incr('s3_event_event')
            records_to_create = []

            # Use event time as archive publication.
            event_time = ciso8601.parse_datetime(event_record['eventTime'])
            event_time = event_time.strftime(utils.DATETIME_FORMAT)

            key = event_record['s3']['object']['key']
            filesize = event_record['s3']['object']['size']
            url = utils.key_to_archive_url(key)

            logger.debug("Event file {}".format(url))

            try:
                product = key.split('/')[1]  # /pub/thunderbird/nightly/...
            except IndexError:
                continue  # e.g. https://archive.mozilla.org/favicon.ico

            if product not in utils.ALL_PRODUCTS:
                logger.info('Skip product {}'.format(product))
                continue

            # Release / Nightly / RC archive.
            if utils.is_build_url(product, url):
                logger.info('Processing {} archive: {}'.format(product, key))

                record = utils.record_from_url(url)

                # Use S3 event info for the archive.
                record['download']['size'] = filesize
                record['download']['date'] = event_time

                # Fetch release metadata.
                await scan_candidates(session, product)
                logger.debug("Fetch record metadata")
                metadata = await fetch_metadata(session, record)
                # If JSON metadata is not available yet, the archive will be
                # handled when the JSON file is delivered.
                if metadata is None:
                    logger.info(f"JSON metadata not available {record['id']}")
                    continue

                # Merge obtained metadata.
                record = utils.merge_metadata(record, metadata)
                records_to_create.append(record)

            # RC metadata
            elif utils.is_rc_build_metadata(product, url):
                logger.info(f'Processing {product} RC metadata: {key}')

                # pub/firefox/candidates/55.0b12-candidates/build1/mac/en-US/
                #  firefox-55.0b12.json
                logger.debug("Fetch new metadata")
                # It has been known to happen that, right after an S3 event,
                # there is a slight delay before the metadata JSON file
                # becomes available. If that's the case, we want to retry a
                # couple of seconds later on the next backoff attempt.
                metadata = await fetch_json(session, url,
                                            retry_on_notfound=True)
                metadata['buildnumber'] = int(
                    re.search(r'/build(\d+)/', url).group(1))

                # We just received the metadata file. Look up whether the
                # associated archives are here too.
                archives = []
                if 'multi' in url:
                    # For multi we just check that the associated archive
                    # is here already.
                    parent_folder = re.sub('multi/.+$', 'multi/', url)
                    _, files = await fetch_listing(session, parent_folder,
                                                   retry_on_notfound=True)
                    for f in files:
                        rc_url = parent_folder + f['name']
                        if utils.is_build_url(product, rc_url):
                            archives.append(
                                (rc_url, f['size'], f['last_modified']))
                else:
                    # For en-US it's different: the metadata applies to every
                    # localized archive.
                    # Check if they are here by listing the parent folder
                    # (including the en-US archive).
                    l10n_parent_url = re.sub('en-US/.+$', '', url)
                    l10n_folders, _ = await fetch_listing(
                        session,
                        l10n_parent_url,
                        retry_on_notfound=True,
                    )
                    for locale in l10n_folders:
                        _, files = await fetch_listing(
                            session,
                            l10n_parent_url + locale,
                            retry_on_notfound=True,
                        )
                        for f in files:
                            rc_url = l10n_parent_url + locale + f['name']
                            if utils.is_build_url(product, rc_url):
                                archives.append((
                                    rc_url,
                                    f['size'],
                                    f['last_modified'],
                                ))

                for rc_url, size, last_modified in archives:
                    record = utils.record_from_url(rc_url)
                    record['download']['size'] = size
                    record['download']['date'] = last_modified
                    record = utils.merge_metadata(record, metadata)
                    records_to_create.append(record)
                # Theoretically the release should never be there yet :)
                # And repacks like EME-free/sha1 don't seem to be
                # published in RC.

            # Nightly metadata
            # pub/firefox/nightly/2017/08/2017-08-08-11-40-32-mozilla-central/
            #  firefox-57.0a1.en-US.linux-i686.json
            #  -l10n/...
            elif utils.is_nightly_build_metadata(product, url):
                logger.info(f'Processing {product} nightly metadata: {key}')

                logger.debug("Fetch new nightly metadata")
                # See the comment above about the exceptional need for
                # retry_on_notfound here.
                metadata = await fetch_json(session, url,
                                            retry_on_notfound=True)
                platform = metadata['moz_pkg_platform']

                # Check if the english version is here.
                parent_url = re.sub('/[^/]+$', '/', url)
                logger.debug("Fetch parent listing {}".format(parent_url))
                _, files = await fetch_listing(session, parent_url)
                for f in files:
                    if ('.' + platform + '.') not in f['name']:
                        # Metadata are per platform.
                        continue
                    en_nightly_url = parent_url + f['name']
                    if utils.is_build_url(product, en_nightly_url):
                        record = utils.record_from_url(en_nightly_url)
                        record['download']['size'] = f['size']
                        record['download']['date'] = f['last_modified']
                        record = utils.merge_metadata(record, metadata)
                        records_to_create.append(record)
                        break  # Only one file for english.

                # Check also localized versions.
                l10n_folder_url = re.sub('-mozilla-central([^/]*)/([^/]+)$',
                                         '-mozilla-central\\1-l10n/',
                                         url)
                logger.debug("Fetch l10n listing {}".format(l10n_folder_url))
                try:
                    _, files = await fetch_listing(
                        session,
                        l10n_folder_url,
                        retry_on_notfound=True,
                    )
                except ValueError:
                    files = []  # No -l10n/ folder published yet.
                for f in files:
                    if (('.' + platform + '.') not in f['name'] and
                            product != 'mobile'):
                        # Metadata are per platform.
                        # (mobile platforms are contained by folder)
                        continue
                    nightly_url = l10n_folder_url + f['name']
                    if utils.is_build_url(product, nightly_url):
                        record = utils.record_from_url(nightly_url)
                        record['download']['size'] = f['size']
                        record['download']['date'] = f['last_modified']
                        record = utils.merge_metadata(record, metadata)
                        records_to_create.append(record)

            else:
                logger.info('Ignored {}'.format(key))

            logger.debug(f"{len(records_to_create)} records to create.")
            with metrics.timer('s3_event_records_to_create'):
                for record in records_to_create:
                    # Check that field values look OK.
                    utils.check_record(record)
                    # Push the result to Kinto.
                    kinto_client.create_record(data=record,
                                               bucket=bucket,
                                               collection=collection,
                                               if_not_exists=True)
                    logger.info('Created {}'.format(record['id']))
                    metrics.incr('s3_event_record_created')
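# A minimal sketch of how the async main(loop, event) coroutine above might
# be driven from a synchronous AWS Lambda entry point. The handler name
# "lambda_handler" and the use of a module-level event loop are assumptions
# for illustration, not part of the original module.
import asyncio


def lambda_handler(event, context=None):  # hypothetical entry point
    # Reuse the process-wide event loop and run the S3-event processing
    # to completion for this invocation.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop, event))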
async def main(loop, event):
    """
    Trigger when an S3 event kicks in.

    http://docs.aws.amazon.com/AmazonS3/latest/dev/notification-content-structure.html
    """
    server_url = os.getenv('SERVER_URL', 'http://localhost:8888/v1')
    bucket = os.getenv('BUCKET', 'build-hub')
    collection = os.getenv('COLLECTION', 'releases')
    kinto_auth = tuple(os.getenv('AUTH', 'user:pass').split(':'))

    kinto_client = kinto_http.Client(server_url=server_url, auth=kinto_auth,
                                     retry=NB_RETRY_REQUEST)

    records = []
    for record in event['Records']:
        if record.get('EventSource') == 'aws:sns':
            records.extend(json.loads(record['Sns']['Message'])['Records'])
        else:
            records.append(record)

    async with aiohttp.ClientSession(loop=loop) as session:
        for event_record in records:
            records_to_create = []

            # Use event time as archive publication.
            event_time = datetime.datetime.strptime(
                event_record['eventTime'], '%Y-%m-%dT%H:%M:%S.%fZ')
            event_time = event_time.strftime(utils.DATETIME_FORMAT)

            key = event_record['s3']['object']['key']
            filesize = event_record['s3']['object']['size']
            url = utils.ARCHIVE_URL + key

            logger.debug("Event file {}".format(url))

            try:
                product = key.split('/')[1]  # /pub/thunderbird/nightly/...
            except IndexError:
                continue  # e.g. https://archive.mozilla.org/favicon.ico

            if product not in utils.ALL_PRODUCTS:
                logger.info('Skip product {}'.format(product))
                continue

            # Release / Nightly / RC archive.
            if utils.is_build_url(product, url):
                logger.info('Processing {} archive: {}'.format(product, key))

                record = utils.record_from_url(url)

                # Use S3 event info for the archive.
                record['download']['size'] = filesize
                record['download']['date'] = event_time

                # Fetch release metadata.
                await scan_candidates(session, product)
                logger.debug("Fetch record metadata")
                metadata = await fetch_metadata(session, record)
                # If JSON metadata is not available yet, the archive will be
                # handled when the JSON file is delivered.
                if metadata is None:
                    logger.info('JSON metadata not available {}'.format(
                        record['id']))
                    continue

                # Merge obtained metadata.
                record = utils.merge_metadata(record, metadata)
                records_to_create.append(record)

            # RC metadata
            elif utils.is_rc_build_metadata(product, url):
                logger.info('Processing {} RC metadata: {}'.format(
                    product, key))

                # pub/firefox/candidates/55.0b12-candidates/build1/mac/en-US/
                #  firefox-55.0b12.json
                logger.debug("Fetch new metadata")
                metadata = await fetch_json(session, url)
                metadata['buildnumber'] = int(
                    re.search(r'/build(\d+)/', url).group(1))

                # Check if localized languages are here (including the
                # en-US archive).
                l10n_parent_url = re.sub('en-US/.+$', '', url)
                l10n_folders, _ = await fetch_listing(session,
                                                      l10n_parent_url)
                for locale in l10n_folders:
                    _, files = await fetch_listing(session,
                                                   l10n_parent_url + locale)
                    for f in files:
                        rc_url = l10n_parent_url + locale + f['name']
                        if utils.is_build_url(product, rc_url):
                            record = utils.record_from_url(rc_url)
                            record['download']['size'] = f['size']
                            record['download']['date'] = f['last_modified']
                            record = utils.merge_metadata(record, metadata)
                            records_to_create.append(record)
                # Theoretically the release should never be there yet :)
                # And repacks like EME-free/sha1 don't seem to be published
                # in RC.

            # Nightly metadata
            # pub/firefox/nightly/2017/08/2017-08-08-11-40-32-mozilla-central/
            #  firefox-57.0a1.en-US.linux-i686.json
            #  -l10n/...
            elif utils.is_nightly_build_metadata(product, url):
                logger.info('Processing {} nightly metadata: {}'.format(
                    product, key))

                logger.debug("Fetch new nightly metadata")
                metadata = await fetch_json(session, url)
                platform = metadata['moz_pkg_platform']

                # Check if the english version is here.
                parent_url = re.sub('/[^/]+$', '/', url)
                logger.debug("Fetch parent listing {}".format(parent_url))
                _, files = await fetch_listing(session, parent_url)
                for f in files:
                    if ('.' + platform + '.') not in f['name']:
                        # Metadata are per platform.
                        continue
                    en_nightly_url = parent_url + f['name']
                    if utils.is_build_url(product, en_nightly_url):
                        record = utils.record_from_url(en_nightly_url)
                        record['download']['size'] = f['size']
                        record['download']['date'] = f['last_modified']
                        record = utils.merge_metadata(record, metadata)
                        records_to_create.append(record)
                        break  # Only one file for english.

                # Check also localized versions.
                l10n_folder_url = re.sub('-mozilla-central([^/]*)/([^/]+)$',
                                         '-mozilla-central\\1-l10n/',
                                         url)
                logger.debug("Fetch l10n listing {}".format(l10n_folder_url))
                try:
                    _, files = await fetch_listing(session, l10n_folder_url)
                except ValueError:
                    files = []  # No -l10n/ folder published yet.
                for f in files:
                    if (('.' + platform + '.') not in f['name'] and
                            product != 'mobile'):
                        # Metadata are per platform.
                        # (mobile platforms are contained by folder)
                        continue
                    nightly_url = l10n_folder_url + f['name']
                    if utils.is_build_url(product, nightly_url):
                        record = utils.record_from_url(nightly_url)
                        record['download']['size'] = f['size']
                        record['download']['date'] = f['last_modified']
                        record = utils.merge_metadata(record, metadata)
                        records_to_create.append(record)

            else:
                logger.info('Ignored {}'.format(key))

            logger.debug("{} records to create.".format(
                len(records_to_create)))
            for record in records_to_create:
                # Check that field values look OK.
                utils.check_record(record)
                # Push the result to Kinto.
                kinto_client.create_record(data=record,
                                           bucket=bucket,
                                           collection=collection,
                                           if_not_exists=True)
                logger.info('Created {}'.format(record['id']))
def test_record_from_url(record):
    url = record['download']['url']
    from_url = record_from_url(url)
    assert from_url == record
async def csv_to_records(
    loop,
    stdin,
    skip_incomplete=True,
    min_last_modified=None,
    cache_folder=CACHE_FOLDER,
):
    """
    :rtype: async generator of records (dict-like)
    """
    async def inventory_by_folder(stdin):
        previous = None
        result = []
        async for entry in read_csv(stdin):
            object_key = entry['Key']
            folder = os.path.dirname(object_key)

            if previous is None:
                previous = folder

            if previous == folder:
                result.append(entry)
            else:
                yield result
                previous = folder
                result = [entry]
        if result:
            yield result

    # Read metadata of previous run, and warm up cache.
    # Will save a lot of hits to archive.mozilla.org.
    metadata_cache_file = os.path.join(cache_folder,
                                       '.metadata-{}.json'.format(__version__))
    if os.path.exists(metadata_cache_file):
        with open(metadata_cache_file) as f:
            metadata = json.load(f)
        _rc_metadata.update(metadata['rc'])
        _release_metadata.update(metadata['release'])
        _nightly_metadata.update(metadata['nightly'])

    async with aiohttp.ClientSession(loop=loop) as session:
        batch = []

        async for entries in inventory_by_folder(stdin):
            for entry in entries:
                object_key = entry['Key']

                # This is the lowest barrier of entry (no pun intended).
                # If the entry's 'Key' value doesn't end with any of the
                # known FILE_EXTENSIONS it will never pass as a build URL
                # later in the loop.
                if not any(
                    object_key.endswith(ext) for ext in FILE_EXTENSIONS
                ):
                    # Eventually that FILE_EXTENSIONS check will be done
                    # again, in more detail, inside the is_build_url
                    # function. This step just weeds out the easy ones.
                    continue

                # When 'min_last_modified' is set, and it's something like
                # 24 hours, then probably 99% of records can be skipped with
                # this little date comparison. So do this check for a skip
                # as early as possible.
                # See https://github.com/mozilla-services/buildhub/issues/427
                # Note! ciso8601.parse_datetime will always return a
                # timezone-aware datetime.datetime instance with tzinfo=UTC.
                lastmodified = ciso8601.parse_datetime(
                    entry['LastModifiedDate'])
                if min_last_modified and lastmodified < min_last_modified:
                    continue

                try:
                    # /pub/thunderbird/nightly/...
                    product = object_key.split('/')[1]
                except IndexError:
                    continue  # e.g. https://archive.mozilla.org/favicon.ico
                if product not in PRODUCTS:
                    continue

                url = key_to_archive_url(object_key)

                if not is_build_url(product, url):
                    continue
                try:
                    record = record_from_url(url)
                except Exception as e:
                    logger.exception(e)
                    continue

                # Scan the list of candidates metadata (no-op if already
                # initialized).
                await scan_candidates(session, product)

                # Complete with info that can't be obtained from the URL.
                filesize = int(float(entry['Size']))  # e.g. 2E+10
                lastmodified = lastmodified.strftime(DATETIME_FORMAT)
                record['download']['size'] = filesize
                record['download']['date'] = lastmodified

                batch.append(record)
                if len(batch) == NB_PARALLEL_REQUESTS:
                    async for result in process_batch(session, batch,
                                                      skip_incomplete):
                        yield result
                    batch = []  # Go on.

        # Last loop iteration.
        async for result in process_batch(session, batch, skip_incomplete):
            yield result

        # Save accumulated metadata for next runs.
        tmpfilename = metadata_cache_file + '.tmp'
        metadata = {
            'rc': _rc_metadata,
            'release': _release_metadata,
            'nightly': _nightly_metadata,
        }
        json.dump(metadata, open(tmpfilename, 'w'))
        os.rename(tmpfilename, metadata_cache_file)
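# A minimal sketch of consuming the csv_to_records() async generator above,
# assuming `stdin` is whatever asynchronous stream object read_csv() expects
# (the inventory command sets up the real stream). The helper name
# `dump_records` is purely illustrative.
import asyncio


async def dump_records(loop, stdin):
    # Iterate over the async generator and print each record id as it
    # comes out of process_batch().
    count = 0
    async for record in csv_to_records(loop, stdin, skip_incomplete=True):
        count += 1
        print(record['id'])
    return count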