Example #1
def main(argv):
    del argv  # Only used by app.run().
    filenames = glob.glob(FLAGS.input, recursive=True)
    logging.info('Found %d datasets', len(filenames))
    dois = collections.defaultdict(list)
    for filename in filenames:
        logging.info('Checking %s', filename)
        dataset = message_helpers.load_message(filename, dataset_pb2.Dataset)
        dataset_id = os.path.splitext(os.path.basename(filename))[0]
        assert dataset.dataset_id == dataset_id
        doi_set = set()
        for reaction in dataset.reactions:
            # Some poorly-validated DOI entries start with 'doi:'...
            match = re.fullmatch(r'(?:(?:doi)|(?:DOI))?:?\s*(.*)',
                                 reaction.provenance.doi)
            doi_set.add(match.group(1))
        for doi in doi_set:
            dois[doi].append(dataset_id)
    for doi in sorted(dois):
        print(f'* [{doi}](https://doi.org/{doi})')
        for dataset in sorted(dois[doi]):
            url = urllib.parse.urljoin(
                _PREFIX,
                message_helpers.id_filename(dataset) + '.pbtxt')
            print(f'  * [{dataset}]({url})')
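The regex above normalizes DOI strings whose prefixes were entered inconsistently. A minimal, self-contained sketch of that step using only the standard library; the sample strings are hypothetical:

import re

# Hypothetical DOI strings as they might appear in reaction provenance.
samples = ['10.1000/xyz123', 'doi:10.1000/xyz123', 'DOI: 10.1000/xyz123']
for raw in samples:
    match = re.fullmatch(r'(?:(?:doi)|(?:DOI))?:?\s*(.*)', raw)
    print(match.group(1))  # Each line prints '10.1000/xyz123'.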
Example #2
def extract_data(message, root, min_size=0.0, max_size=1.0):
    """Replaces large Data values with pointers to offloaded data.

    Git LFS (https://git-lfs.github.com/) is convenient because it lives in the
    same repo as the associated Reaction records. However, it is not possible to
    get a permanent URL for the uploaded data because it is only committed to
    the PR branch. We have (at least) these options:

        1. Modify the URL just before or after the PR is merged to point to the
           correct branch.
        2. Modify the URL to point to its eventual destination (in the `main`
           branch) and deal with broken links during submission review.
        3. Use relative paths (relative to the repository root). This means that
           users will have to traverse the repo manually to access referenced
           data instead of simply following a URL.
        4. Merge the data immediately in another repo so the URL is permanent.

    I think (2) is the best option because it yields URLs that will eventually
    work and it is simpler than (1). I don't like option (4) because it requires
    data to be committed and merged before review.

    Args:
        message: Protocol buffer message.
        root: Text root of the repository.
        min_size: Float minimum size of data before it will be written (in MB).
        max_size: Float maximum size of data to write (in MB).

    Returns:
        Set of text filenames; the generated Data files.
    """
    dirname = tempfile.mkdtemp()
    data_messages = message_helpers.find_submessages(message,
                                                     reaction_pb2.Data)
    filenames = set()
    for data_message in data_messages:
        data_filename, data_size = write_data(data_message,
                                              dirname,
                                              min_size=min_size,
                                              max_size=max_size)
        if data_filename:
            basename = os.path.basename(data_filename)
            output_filename = message_helpers.id_filename(basename)
            with_root = flask.safe_join(root, output_filename)
            if os.path.exists(with_root):
                warnings.warn(f'Target Data blob already exists: {with_root}')
            else:
                os.makedirs(os.path.dirname(with_root), exist_ok=True)
                shutil.copy2(data_filename, with_root)
                filenames.add(with_root)
            data_message.url = urllib.parse.urljoin(DATA_URL_PREFIX,
                                                    output_filename)
            logging.info('Created Data link (%g MB): %s', data_size, with_root)
    shutil.rmtree(dirname)
    return filenames
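Both examples above build links with urllib.parse.urljoin, whose result depends on whether the prefix ends with a slash. A small, self-contained sketch of that detail; the prefix strings here are hypothetical stand-ins for _PREFIX and DATA_URL_PREFIX:

import urllib.parse

# With a trailing slash, the relative path is appended to the prefix.
print(urllib.parse.urljoin('https://example.com/data/', 'ab/blob.pbtxt'))
# https://example.com/data/ab/blob.pbtxt

# Without a trailing slash, the last path segment of the prefix is replaced.
print(urllib.parse.urljoin('https://example.com/data', 'ab/blob.pbtxt'))
# https://example.com/ab/blob.pbtxt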
Example #3
def _run_updates(inputs, datasets):
    """Updates the submission files.

    Args:
        inputs: List of FileStatus objects.
        datasets: Dict mapping filenames to Dataset messages.
    """
    for dataset in datasets.values():
        # Set reaction_ids, resolve names, fix cross-references, etc.
        updates.update_dataset(dataset)
        # Offload large Data values.
        data_filenames = data_storage.extract_data(dataset,
                                                   FLAGS.root,
                                                   min_size=FLAGS.min_size,
                                                   max_size=FLAGS.max_size)
        if data_filenames:
            args = ['git', 'add'] + list(data_filenames)
            logging.info('Running command: %s', ' '.join(args))
            subprocess.run(args, check=True)
    combined = _combine_datasets(datasets)
    # Final validation to make sure we didn't break anything.
    options = validations.ValidationOptions(validate_ids=True,
                                            require_provenance=True)
    validations.validate_datasets({'_COMBINED': combined},
                                  FLAGS.write_errors,
                                  options=options)
    if FLAGS.output:
        output_filename = FLAGS.output
    else:
        _, suffix = os.path.splitext(inputs[0].filename)
        output_filename = os.path.join(
            FLAGS.root,
            message_helpers.id_filename(f'{combined.dataset_id}{suffix}'))
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    if FLAGS.cleanup:
        cleanup(inputs, output_filename)
    logging.info('writing combined Dataset to %s', output_filename)
    message_helpers.write_message(combined, output_filename)
    # Write a binary version for fast read/write.
    root, ext = os.path.splitext(output_filename)
    if FLAGS.write_binary and ext != '.pb':
        binary_filename = root + '.pb'
        logging.info('writing combined Dataset (binary) to %s',
                     binary_filename)
        message_helpers.write_message(combined, binary_filename)
        args = ['git', 'add', binary_filename]
        logging.info('Running command: %s', ' '.join(args))
        subprocess.run(args, check=True)
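The binary output at the end of _run_updates is named by swapping the extension of the text output. A small stdlib sketch of that step; the filename is a hypothetical example:

import os

output_filename = 'data/ab/ord_dataset-abc123.pbtxt'  # Hypothetical text output.
root, ext = os.path.splitext(output_filename)
if ext != '.pb':  # Guard against rewriting an output that is already binary.
    binary_filename = root + '.pb'
    print(binary_filename)  # data/ab/ord_dataset-abc123.pb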
Example #4
def main(argv):
    del argv  # Only used by app.run().
    inputs = sorted(_get_inputs())
    if not inputs:
        logging.info('nothing to do')
        return  # Nothing to do.
    datasets = {}
    for file_status in inputs:
        datasets[file_status.filename] = message_helpers.load_message(
            file_status.filename, dataset_pb2.Dataset)
    if FLAGS.validate:
        validations.validate_datasets(datasets, FLAGS.write_errors)
    if not FLAGS.update:
        logging.info('nothing else to do; use --update for more')
        return  # Nothing else to do.
    for dataset in datasets.values():
        for reaction in dataset.reactions:
            updates.update_reaction(reaction)
        # Offload large Data values.
        data_filenames = data_storage.extract_data(dataset,
                                                   FLAGS.root,
                                                   min_size=FLAGS.min_size,
                                                   max_size=FLAGS.max_size)
        if data_filenames:
            args = ['git', 'add'] + list(data_filenames)
            logging.info('Running command: %s', ' '.join(args))
            subprocess.run(args, check=True)
    combined = _combine_datasets(datasets)
    # Final validation to make sure we didn't break anything.
    validations.validate_datasets({'_COMBINED': combined}, FLAGS.write_errors)
    if FLAGS.output:
        output_filename = FLAGS.output
    else:
        _, suffix = os.path.splitext(inputs[0].filename)
        output_filename = os.path.join(
            FLAGS.root,
            message_helpers.id_filename(f'{combined.dataset_id}{suffix}'))
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    if FLAGS.cleanup:
        cleanup(inputs, output_filename)
    logging.info('writing combined Dataset to %s', output_filename)
    message_helpers.write_message(combined, output_filename)
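Both main() functions rely on absl flags (FLAGS.root, FLAGS.min_size, FLAGS.max_size, FLAGS.output, FLAGS.cleanup, and so on) defined at module level. A hedged sketch of how such flags could be declared; the names mirror the references above, but the exact declarations and defaults in the original scripts are assumptions:

from absl import app
from absl import flags

FLAGS = flags.FLAGS

# Hypothetical declarations; defaults and help strings are illustrative only.
flags.DEFINE_string('root', '', 'Root of the data repository.')
flags.DEFINE_string('output', None, 'Filename for the combined Dataset.')
flags.DEFINE_float('min_size', 1.0, 'Minimum size (MB) before Data values are offloaded.')
flags.DEFINE_float('max_size', 100.0, 'Maximum size (MB) of offloaded Data values.')
flags.DEFINE_boolean('validate', True, 'Whether to validate the input Datasets.')
flags.DEFINE_boolean('update', False, 'Whether to update the submission files.')
flags.DEFINE_boolean('cleanup', True, 'Whether to remove the original input files.')
flags.DEFINE_boolean('write_errors', False, 'Whether to write validation errors to disk.')

if __name__ == '__main__':
    app.run(main)  # main() as defined above.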
Example #5
def test_id_filename(self, filename, expected):
    self.assertEqual(message_helpers.id_filename(filename), expected)
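The test method above takes (filename, expected) pairs, which suggests it is driven by a parameterization decorator. A minimal, self-contained sketch of that pattern using absl.testing.parameterized; the function under test and the cases are trivial stand-ins, not message_helpers.id_filename:

import os

from absl.testing import absltest
from absl.testing import parameterized


def force_pbtxt_extension(filename):
    # Trivial stand-in used only to demonstrate the parameterized pattern.
    root, _ = os.path.splitext(filename)
    return root + '.pbtxt'


class ForcePbtxtExtensionTest(parameterized.TestCase):

    @parameterized.parameters(
        ('dataset.pb', 'dataset.pbtxt'),
        ('dataset.pbtxt', 'dataset.pbtxt'),
    )
    def test_force_pbtxt_extension(self, filename, expected):
        self.assertEqual(force_pbtxt_extension(filename), expected)


if __name__ == '__main__':
    absltest.main()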