async def csv_to_records(loop, stdin, skip_incomplete=True):
    """
    :rtype: async generator of records (dict-like)
    """
    async def inventory_by_folder(stdin):
        previous = None
        result = []
        async for entry in read_csv(stdin):
            object_key = entry['Key']
            folder = os.path.dirname(object_key)

            if previous is None:
                previous = folder

            if previous == folder:
                result.append(entry)
            else:
                yield result
                previous = folder
                result = [entry]
        if result:
            yield result

    def deduplicate_entries(entries):
        # Windows releases are published as both .zip and .exe files.
        # Deduplicate these (keep .exe only).
        # Some old Linux versions (1.5b2) were published with installer.tar.gz.
        filtered = [e for e in entries if e['Key'].endswith('.exe')]
        if len(filtered) > 0:
            entries = filtered
        longer_first = sorted(entries,
                              key=lambda e: len(e['Key']),
                              reverse=True)
        deduplicate = {
            e['Key'].lower()
                    .replace('+setup+', '-')
                    .replace('.installer.exe', '')
                    .replace('.exe', '')
                    .replace('.installer.tar.gz', '')
                    .replace('.tar.gz', '')
                    .replace('.zip', ''): e
            for e in longer_first
        }
        return deduplicate.values()

    # Read metadata of previous run, and warm up cache.
    # Will save a lot of hits to archive.mozilla.org.
    metadata_cache_file = os.path.join(CACHE_FOLDER,
                                       '.metadata-{}.json'.format(__version__))
    if os.path.exists(metadata_cache_file):
        with open(metadata_cache_file) as f:
            metadata = json.load(f)
        _rc_metadata.update(metadata['rc'])
        _release_metadata.update(metadata['release'])
        _nightly_metadata.update(metadata['nightly'])

    async with aiohttp.ClientSession(loop=loop) as session:
        batch = []

        async for entries in inventory_by_folder(stdin):
            entries = deduplicate_entries(entries)

            for entry in entries:
                object_key = entry['Key']

                try:
                    # /pub/thunderbird/nightly/...
                    product = object_key.split('/')[1]
                except IndexError:
                    continue  # e.g. https://archive.mozilla.org/favicon.ico

                if product not in PRODUCTS:
                    continue

                # Scan the list of candidates metadata (no-op if already initialized).
                await scan_candidates(session, product)

                url = ARCHIVE_URL + object_key.replace('+', ' ')

                if not is_build_url(product, url):
                    continue
                try:
                    record = record_from_url(url)
                except Exception as e:
                    logger.exception(e)
                    continue

                # Complete with info that can't be obtained from the URL.
                filesize = int(float(entry['Size']))  # e.g. 2E+10
                lastmodified = datetime.datetime.strptime(
                    entry['LastModifiedDate'], '%Y-%m-%dT%H:%M:%S.%fZ')
                lastmodified = lastmodified.strftime(DATETIME_FORMAT)
                record['download']['size'] = filesize
                record['download']['date'] = lastmodified

                if len(batch) < NB_PARALLEL_REQUESTS:
                    batch.append(record)
                else:
                    async for result in process_batch(session, batch,
                                                      skip_incomplete):
                        yield result

                    batch = [record]  # Start a new batch with this record.

        # Last loop iteration.
        async for result in process_batch(session, batch, skip_incomplete):
            yield result

    # Save accumulated metadata for next runs.
    tmpfilename = metadata_cache_file + '.tmp'
    metadata = {
        'rc': _rc_metadata,
        'release': _release_metadata,
        'nightly': _nightly_metadata,
    }
    with open(tmpfilename, 'w') as f:
        json.dump(metadata, f)
    os.rename(tmpfilename, metadata_cache_file)
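
A minimal driver sketch for the generator above, assuming a POSIX stdin fed with the S3 inventory CSV. The StreamReader wiring and the plain print() consumer are illustrative assumptions, not how buildhub's actual command-line entry point invokes csv_to_records:

import asyncio
import sys


async def run(loop):
    # Wrap sys.stdin in an asyncio StreamReader so the async generator can
    # read it line by line (POSIX pipes only).
    reader = asyncio.StreamReader()
    protocol = asyncio.StreamReaderProtocol(reader)
    await loop.connect_read_pipe(lambda: protocol, sys.stdin)

    async for record in csv_to_records(loop, reader):
        print(record['download']['url'])


loop = asyncio.get_event_loop()
loop.run_until_complete(run(loop))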
Example #2
async def main(loop, event):
    """
    Triggered when an S3 event kicks in.
    http://docs.aws.amazon.com/AmazonS3/latest/dev/notification-content-structure.html
    """
    server_url = config('SERVER_URL', default='http://localhost:8888/v1')
    bucket = config('BUCKET', default='build-hub')
    collection = config('COLLECTION', default='releases')
    kinto_auth = tuple(config('AUTH', 'user:pass').split(':'))

    kinto_client = kinto_http.Client(server_url=server_url,
                                     auth=kinto_auth,
                                     retry=NB_RETRY_REQUEST)

    records = []
    for record in event['Records']:
        if record.get('EventSource') == 'aws:sns':
            records.extend(json.loads(record['Sns']['Message'])['Records'])
        else:
            records.append(record)

    async with aiohttp.ClientSession(loop=loop) as session:
        for event_record in records:
            metrics.incr('s3_event_event')
            records_to_create = []

            # Use the event time as the archive publication date.
            event_time = ciso8601.parse_datetime(event_record['eventTime'])
            event_time = event_time.strftime(utils.DATETIME_FORMAT)

            key = event_record['s3']['object']['key']

            filesize = event_record['s3']['object']['size']
            url = utils.key_to_archive_url(key)

            logger.debug("Event file {}".format(url))

            try:
                product = key.split('/')[1]  # /pub/thunderbird/nightly/...
            except IndexError:
                continue  # e.g. https://archive.mozilla.org/favicon.ico

            if product not in utils.ALL_PRODUCTS:
                logger.info('Skip product {}'.format(product))
                continue

            # Release / Nightly / RC archive.
            if utils.is_build_url(product, url):
                logger.info('Processing {} archive: {}'.format(product, key))

                record = utils.record_from_url(url)
                # Use the S3 event info for the archive.
                record['download']['size'] = filesize
                record['download']['date'] = event_time

                # Fetch release metadata.
                await scan_candidates(session, product)
                logger.debug("Fetch record metadata")
                metadata = await fetch_metadata(session, record)
                # If JSON metadata not available, archive will be
                # handled when JSON is delivered.
                if metadata is None:
                    logger.info(f"JSON metadata not available {record['id']}")
                    continue

                # Merge obtained metadata.
                record = utils.merge_metadata(record, metadata)
                records_to_create.append(record)

            # RC metadata
            elif utils.is_rc_build_metadata(product, url):
                logger.info(f'Processing {product} RC metadata: {key}')

                # pub/firefox/candidates/55.0b12-candidates/build1/mac/en-US/
                # firefox-55.0b12.json
                logger.debug("Fetch new metadata")
                # It has been known to happen that right after an S3 Event
                # there's a slight delay to the metadata json file being
                # available. If that's the case we want to retry in a couple
                # of seconds to see if it's available on the next backoff
                # attempt.
                metadata = await fetch_json(session,
                                            url,
                                            retry_on_notfound=True)
                metadata['buildnumber'] = int(
                    re.search(r'/build(\d+)/', url).group(1))

                # We just received the metadata file. Lookup if the associated
                # archives are here too.
                archives = []
                if 'multi' in url:
                    # For multi we just check the associated archive
                    # is here already.
                    parent_folder = re.sub('multi/.+$', 'multi/', url)
                    _, files = await fetch_listing(session,
                                                   parent_folder,
                                                   retry_on_notfound=True)
                    for f in files:
                        rc_url = parent_folder + f['name']
                        if utils.is_build_url(product, rc_url):
                            archives.append(
                                (rc_url, f['size'], f['last_modified']))
                else:
                    # For en-US it's different: the metadata applies to
                    # every localized archive.
                    # Check if they are here by listing the parent folder
                    # (including en-US archive).
                    l10n_parent_url = re.sub('en-US/.+$', '', url)
                    l10n_folders, _ = await fetch_listing(
                        session,
                        l10n_parent_url,
                        retry_on_notfound=True,
                    )
                    for locale in l10n_folders:
                        _, files = await fetch_listing(
                            session,
                            l10n_parent_url + locale,
                            retry_on_notfound=True,
                        )
                        for f in files:
                            rc_url = l10n_parent_url + locale + f['name']
                            if utils.is_build_url(product, rc_url):
                                archives.append((
                                    rc_url,
                                    f['size'],
                                    f['last_modified'],
                                ))

                for rc_url, size, last_modified in archives:
                    record = utils.record_from_url(rc_url)
                    record['download']['size'] = size
                    record['download']['date'] = last_modified
                    record = utils.merge_metadata(record, metadata)
                    records_to_create.append(record)
                # Theoretically the release should never be there yet :)
                # And repacks like EME-free/sha1 don't seem to be
                # published in RC.

            # Nightly metadata
            # pub/firefox/nightly/2017/08/2017-08-08-11-40-32-mozilla-central/
            # firefox-57.0a1.en-US.linux-i686.json
            # -l10n/...
            elif utils.is_nightly_build_metadata(product, url):
                logger.info(f'Processing {product} nightly metadata: {key}')

                logger.debug("Fetch new nightly metadata")
                # See comment above about the exceptional need of
                # setting retry_on_notfound here.
                metadata = await fetch_json(session,
                                            url,
                                            retry_on_notfound=True)

                platform = metadata['moz_pkg_platform']

                # Check if english version is here.
                parent_url = re.sub('/[^/]+$', '/', url)
                logger.debug("Fetch parent listing {}".format(parent_url))
                _, files = await fetch_listing(session, parent_url)
                for f in files:
                    if ('.' + platform + '.') not in f['name']:
                        # metadata are by platform.
                        continue
                    en_nightly_url = parent_url + f['name']
                    if utils.is_build_url(product, en_nightly_url):
                        record = utils.record_from_url(en_nightly_url)
                        record['download']['size'] = f['size']
                        record['download']['date'] = f['last_modified']
                        record = utils.merge_metadata(record, metadata)
                        records_to_create.append(record)
                        break  # Only one file for english.

                # Check also localized versions.
                l10n_folder_url = re.sub('-mozilla-central([^/]*)/([^/]+)$',
                                         '-mozilla-central\\1-l10n/', url)
                logger.debug("Fetch l10n listing {}".format(l10n_folder_url))
                try:
                    _, files = await fetch_listing(
                        session,
                        l10n_folder_url,
                        retry_on_notfound=True,
                    )
                except ValueError:
                    files = []  # No -l10n/ folder published yet.
                for f in files:
                    if (('.' + platform + '.') not in f['name']
                            and product != 'mobile'):
                        # metadata are by platform.
                        # (mobile platforms are contained by folder)
                        continue
                    nightly_url = l10n_folder_url + f['name']
                    if utils.is_build_url(product, nightly_url):
                        record = utils.record_from_url(nightly_url)
                        record['download']['size'] = f['size']
                        record['download']['date'] = f['last_modified']
                        record = utils.merge_metadata(record, metadata)
                        records_to_create.append(record)

            else:
                logger.info('Ignored {}'.format(key))

            logger.debug(f"{len(records_to_create)} records to create.")
            with metrics.timer('s3_event_records_to_create'):
                for record in records_to_create:
                    # Check that fields values look OK.
                    utils.check_record(record)
                    # Push result to Kinto.
                    kinto_client.create_record(data=record,
                                               bucket=bucket,
                                               collection=collection,
                                               if_not_exists=True)
                    logger.info('Created {}'.format(record['id']))
                    metrics.incr('s3_event_record_created')
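
A hedged sketch of how the coroutine above might be wired into AWS Lambda; the handler name and the event-loop bootstrapping are assumptions, since the example only shows the async main itself:

import asyncio


def lambda_handler(event, context):
    # Hypothetical Lambda entry point: run main() to completion for each
    # invocation, passing the S3/SNS event payload straight through.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop, event))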
Example #3
async def main(loop, event):
    """
    Triggered when an S3 event kicks in.
    http://docs.aws.amazon.com/AmazonS3/latest/dev/notification-content-structure.html
    """
    server_url = os.getenv('SERVER_URL', 'http://localhost:8888/v1')
    bucket = os.getenv('BUCKET', 'build-hub')
    collection = os.getenv('COLLECTION', 'releases')
    kinto_auth = tuple(os.getenv('AUTH', 'user:pass').split(':'))

    kinto_client = kinto_http.Client(server_url=server_url,
                                     auth=kinto_auth,
                                     retry=NB_RETRY_REQUEST)

    records = []
    for record in event['Records']:
        if record.get('EventSource') == 'aws:sns':
            records.extend(json.loads(record['Sns']['Message'])['Records'])
        else:
            records.append(record)

    async with aiohttp.ClientSession(loop=loop) as session:
        for event_record in records:
            records_to_create = []

            # Use the event time as the archive publication date.
            event_time = datetime.datetime.strptime(event_record['eventTime'],
                                                    '%Y-%m-%dT%H:%M:%S.%fZ')
            event_time = event_time.strftime(utils.DATETIME_FORMAT)

            key = event_record['s3']['object']['key']
            filesize = event_record['s3']['object']['size']
            url = utils.ARCHIVE_URL + key
            logger.debug("Event file {}".format(url))

            try:
                product = key.split('/')[1]  # /pub/thunderbird/nightly/...
            except IndexError:
                continue  # e.g. https://archive.mozilla.org/favicon.ico

            if product not in utils.ALL_PRODUCTS:
                logger.info('Skip product {}'.format(product))
                continue

            # Release / Nightly / RC archive.
            if utils.is_build_url(product, url):
                logger.info('Processing {} archive: {}'.format(product, key))

                record = utils.record_from_url(url)
                # Use the S3 event info for the archive.
                record['download']['size'] = filesize
                record['download']['date'] = event_time

                # Fetch release metadata.
                await scan_candidates(session, product)
                logger.debug("Fetch record metadata")
                metadata = await fetch_metadata(session, record)
                # If JSON metadata not available, archive will be handled when JSON
                # is delivered.
                if metadata is None:
                    logger.info('JSON metadata not available {}'.format(
                        record['id']))
                    continue

                # Merge obtained metadata.
                record = utils.merge_metadata(record, metadata)
                records_to_create.append(record)

            # RC metadata
            elif utils.is_rc_build_metadata(product, url):
                logger.info('Processing {} RC metadata: {}'.format(
                    product, key))

                # pub/firefox/candidates/55.0b12-candidates/build1/mac/en-US/
                # firefox-55.0b12.json
                logger.debug("Fetch new metadata")
                metadata = await fetch_json(session, url)
                metadata['buildnumber'] = int(
                    re.search(r'/build(\d+)/', url).group(1))

                # Check if the localized archives are here
                # (including the en-US archive).
                l10n_parent_url = re.sub('en-US/.+$', '', url)
                l10n_folders, _ = await fetch_listing(session, l10n_parent_url)
                for locale in l10n_folders:
                    _, files = await fetch_listing(session,
                                                   l10n_parent_url + locale)
                    for f in files:
                        rc_url = l10n_parent_url + locale + f['name']
                        if utils.is_build_url(product, rc_url):
                            record = utils.record_from_url(rc_url)
                            record['download']['size'] = f['size']
                            record['download']['date'] = f['last_modified']
                            record = utils.merge_metadata(record, metadata)
                            records_to_create.append(record)
                # Theoretically the release should never be there yet :)
                # And repacks like EME-free/sha1 don't seem to be published in RC.

            # Nightly metadata
            # pub/firefox/nightly/2017/08/2017-08-08-11-40-32-mozilla-central/
            # firefox-57.0a1.en-US.linux-i686.json
            # -l10n/...
            elif utils.is_nightly_build_metadata(product, url):
                logger.info('Processing {} nightly metadata: {}'.format(
                    product, key))

                logger.debug("Fetch new nightly metadata")
                metadata = await fetch_json(session, url)

                platform = metadata['moz_pkg_platform']

                # Check if english version is here.
                parent_url = re.sub('/[^/]+$', '/', url)
                logger.debug("Fetch parent listing {}".format(parent_url))
                _, files = await fetch_listing(session, parent_url)
                for f in files:
                    if ('.' + platform + '.') not in f['name']:
                        # metadata are by platform.
                        continue
                    en_nightly_url = parent_url + f['name']
                    if utils.is_build_url(product, en_nightly_url):
                        record = utils.record_from_url(en_nightly_url)
                        record['download']['size'] = f['size']
                        record['download']['date'] = f['last_modified']
                        record = utils.merge_metadata(record, metadata)
                        records_to_create.append(record)
                        break  # Only one file for english.

                # Check also localized versions.
                l10n_folder_url = re.sub('-mozilla-central([^/]*)/([^/]+)$',
                                         '-mozilla-central\\1-l10n/', url)
                logger.debug("Fetch l10n listing {}".format(l10n_folder_url))
                try:
                    _, files = await fetch_listing(session, l10n_folder_url)
                except ValueError:
                    files = []  # No -l10n/ folder published yet.
                for f in files:
                    if (('.' + platform + '.') not in f['name']
                            and product != 'mobile'):
                        # metadata are by platform.
                        # (mobile platforms are contained by folder)
                        continue
                    nightly_url = l10n_folder_url + f['name']
                    if utils.is_build_url(product, nightly_url):
                        record = utils.record_from_url(nightly_url)
                        record['download']['size'] = f['size']
                        record['download']['date'] = f['last_modified']
                        record = utils.merge_metadata(record, metadata)
                        records_to_create.append(record)

            else:
                logger.info('Ignored {}'.format(key))

            logger.debug("{} records to create.".format(
                len(records_to_create)))
            for record in records_to_create:
                # Check that fields values look OK.
                utils.check_record(record)
                # Push result to Kinto.
                kinto_client.create_record(data=record,
                                           bucket=bucket,
                                           collection=collection,
                                           if_not_exists=True)
                logger.info('Created {}'.format(record['id']))
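
For reference, the l10n folder rewrite used by both nightly handlers behaves like this on the sample path from the comments (a worked example, assuming the archive.mozilla.org prefix; not part of the original code):

import re

url = ('https://archive.mozilla.org/pub/firefox/nightly/2017/08/'
       '2017-08-08-11-40-32-mozilla-central/'
       'firefox-57.0a1.en-US.linux-i686.json')
l10n_folder_url = re.sub('-mozilla-central([^/]*)/([^/]+)$',
                         '-mozilla-central\\1-l10n/', url)
# l10n_folder_url ->
# https://archive.mozilla.org/pub/firefox/nightly/2017/08/
#   2017-08-08-11-40-32-mozilla-central-l10n/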
Example #4
def test_record_from_url(record):
    url = record['download']['url']
    from_url = record_from_url(url)
    assert from_url == record
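
The version of csv_to_records below calls key_to_archive_url(), which is not shown in these examples. A plausible sketch, based on the inline equivalent in Example #1 (ARCHIVE_URL + object_key.replace('+', ' ')); the exact ARCHIVE_URL value is an assumption:

ARCHIVE_URL = 'https://archive.mozilla.org/'  # assumed, not shown above


def key_to_archive_url(object_key):
    # S3 inventory keys escape spaces as '+'; the archive URLs use spaces.
    return ARCHIVE_URL + object_key.replace('+', ' ')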
async def csv_to_records(
    loop,
    stdin,
    skip_incomplete=True,
    min_last_modified=None,
    cache_folder=CACHE_FOLDER,
):
    """
    :rtype: async generator of records (dict-like)
    """
    async def inventory_by_folder(stdin):
        previous = None
        result = []
        async for entry in read_csv(stdin):
            object_key = entry['Key']
            folder = os.path.dirname(object_key)
            if previous is None:
                previous = folder

            if previous == folder:
                result.append(entry)
            else:
                yield result
                previous = folder
                result = [entry]
        if result:
            yield result

    # Read metadata of previous run, and warm up cache.
    # Will save a lot of hits to archive.mozilla.org.
    metadata_cache_file = os.path.join(cache_folder,
                                       '.metadata-{}.json'.format(__version__))
    if os.path.exists(metadata_cache_file):
        with open(metadata_cache_file) as f:
            metadata = json.load(f)
        _rc_metadata.update(metadata['rc'])
        _release_metadata.update(metadata['release'])
        _nightly_metadata.update(metadata['nightly'])

    async with aiohttp.ClientSession(loop=loop) as session:
        batch = []

        async for entries in inventory_by_folder(stdin):
            for entry in entries:
                object_key = entry['Key']

                # This is the lowest barrier of entry (no pun intended).
                # If the entry's 'Key' value doesn't end with any of the
                # known FILE_EXTENSIONS it will never pass the build-URL
                # check later in the loop.
                if not any(
                        object_key.endswith(ext) for ext in FILE_EXTENSIONS):
                    # The FILE_EXTENSIONS check is repeated in more detail
                    # inside is_build_url(); this step just weeds out the
                    # easy cases early.
                    continue

                # When 'min_last_modified' is set (e.g. to the last 24 hours),
                # the vast majority of records can be skipped with this simple
                # date comparison, so do the check as early as possible.
                # See https://github.com/mozilla-services/buildhub/issues/427
                # Note! ciso8601.parse_datetime always returns a timezone-aware
                # datetime.datetime instance with tzinfo=UTC.
                lastmodified = ciso8601.parse_datetime(
                    entry['LastModifiedDate'])
                if min_last_modified and lastmodified < min_last_modified:
                    continue

                try:
                    # /pub/thunderbird/nightly/...
                    product = object_key.split('/')[1]
                except IndexError:
                    continue  # e.g. https://archive.mozilla.org/favicon.ico

                if product not in PRODUCTS:
                    continue

                url = key_to_archive_url(object_key)

                if not is_build_url(product, url):
                    continue
                try:
                    record = record_from_url(url)
                except Exception as e:
                    logger.exception(e)
                    continue

                # Scan the list of candidates metadata (no-op if
                # already initialized).
                await scan_candidates(session, product)

                # Complete with info that can't be obtained from the URL.
                filesize = int(float(entry['Size']))  # e.g. 2E+10
                lastmodified = lastmodified.strftime(DATETIME_FORMAT)
                record['download']['size'] = filesize
                record['download']['date'] = lastmodified

                batch.append(record)

                if len(batch) == NB_PARALLEL_REQUESTS:
                    async for result in process_batch(session, batch,
                                                      skip_incomplete):
                        yield result

                    batch = []  # Go on.

        # Last loop iteration.
        async for result in process_batch(session, batch, skip_incomplete):
            yield result

    # Save accumulated metadata for next runs.
    tmpfilename = metadata_cache_file + '.tmp'
    metadata = {
        'rc': _rc_metadata,
        'release': _release_metadata,
        'nightly': _nightly_metadata,
    }
    with open(tmpfilename, 'w') as f:
        json.dump(metadata, f)
    os.rename(tmpfilename, metadata_cache_file)
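
Finally, a hedged example of how a caller might compute the min_last_modified cutoff mentioned above; the 24-hour window is illustrative, and the cutoff must be timezone-aware because ciso8601.parse_datetime() returns UTC-aware datetimes:

import datetime

# Skip inventory entries older than roughly one day (illustrative window).
min_last_modified = (datetime.datetime.now(datetime.timezone.utc) -
                     datetime.timedelta(hours=24))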