import os

# NOTE: RecipeParser, Schema, DockerRecipe, make_person, make_dataset,
# get_tmpfile, and run_container_diff are assumed to be provided by the
# project's helper modules (openschemas / spython helpers); their exact
# import paths are not shown here.


def extract(dockerfile, contact, container_name=None, output_html=True):
    '''extract a dataset from a given Dockerfile, and write to an html output
       file. Use container-diff and spython to get information about the
       container.
    '''
    # Step 0. Define absolute paths to our Dockerfile, recipe, output
    here = os.path.abspath(os.path.dirname(__file__))
    recipe_yml = os.path.join(here, "recipe.yml")
    spec_yml = os.path.join(here, "specification.yml")

    # Step 1: Show required and recommended fields from recipe
    recipe = RecipeParser(recipe_yml)

    # Step 2: Create Dataset
    parser = DockerRecipe(dockerfile)
    image = Schema(spec_yml)

    # We can obtain these from the environment, or use reasonable defaults
    thumbnail = os.environ.get(
        'IMAGE_THUMBNAIL',
        'https://vsoch.github.io/datasets/assets/img/avocado.png')
    about = os.environ.get(
        'IMAGE_ABOUT',
        'This is a Dockerfile parsed by the openschemas/extractors container.')
    repository = os.environ.get('GITHUB_REPOSITORY', 'openschemas/extractors')
    description = os.environ.get('IMAGE_DESCRIPTION', 'A Dockerfile build recipe')

    # Step 3: Generate a Person (these are Google Helper functions)
    contact = os.environ.get('GITHUB_ACTOR', contact)
    contact_url = os.environ.get('CONTACT_URL', repository)
    contact_description = os.environ.get('CONTACT_DESCRIPTION', 'Dockerfile maintainer')
    contact_type = os.environ.get('CONTACT_TYPE', 'customer support')
    contact_telephone = os.environ.get('CONTACT_TELEPHONE')

    # Get the repository full url for contact
    if not contact_url.startswith('http'):
        contact_url = "https://www.github.com/%s" % contact_url

    if contact is not None:
        person = make_person(name=contact,
                             description=contact_description,
                             url=contact_url,
                             contact_type=contact_type,
                             telephone=contact_telephone)
        image.properties['creator'] = person
        image.properties['author'] = person

    # image.properties
    if len(parser.environ) > 0:
        image.properties['environment'] = parser.environ
    image.properties['entrypoint'] = parser.entrypoint
    image.properties['version'] = image.version
    image.properties['description'] = description
    image.properties['ContainerImage'] = parser.fromHeader
    image.properties['name'] = container_name

    # Fun properties :)
    image.properties['thumbnailUrl'] = thumbnail
    image.properties['sameAs'] = 'ImageDefinition'
    image.properties['about'] = about
    image.properties['codeRepository'] = 'https://www.github.com/%s' % repository
    image.properties['runtime'] = 'Docker'

    # Generate temporary filename
    output_file = "%s.json" % get_tmpfile("image-definition")

    # Try using container name, if not available default to ContainerImage (FROM)
    layers = run_container_diff(container_name, parser.fromHeader, output_file)

    if len(layers) > 0:

        # softwareRequirements
        requires = []

        # APT and PIP
        # note that the top level key here can be history, files, pip, apt, etc.
        for layer in layers:

            # Pip and Apt will go into softwareRequirements
            if layer['AnalyzeType'] in ["Pip", "Apt"]:
                for pkg in layer['Analysis']:
                    requires.append('%s > %s==%s' % (layer['AnalyzeType'],
                                                     pkg['Name'],
                                                     pkg['Version']))

        image.properties["softwareRequirements"] = requires

    if output_html:
        return make_dataset(image)
    return image.dump_json(pretty_print=True)
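# Example (hedged): a minimal sketch of how the Dockerfile extractor above
# might be called. The file path, contact name, and container name are
# illustrative assumptions, not values from the original module, and the call
# requires container-diff plus the helper functions noted at the top to be
# available (e.g. inside the openschemas/extractors container).
#
#   html = extract(dockerfile="/code/Dockerfile",        # hypothetical path
#                  contact="vsoch",                      # hypothetical maintainer
#                  container_name="vanessa/salad",       # hypothetical image
#                  output_html=True)
#   with open("index.html", "w") as handle:
#       handle.write(html)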
def extract(name, version=None, contact=None, output_html=True,
            description=None, thumbnail=None, sameAs=None, about=None,
            repository=None):
    '''extract a Dataset to describe some Github repository. To add more
       properties, just add them via additional keyword args (kwargs)

       Parameters
       ==========
       name: the name of the Dataset
       version: the version of the Dataset (optional)
       contact: name of a person that is in charge of the dataset
       description: a description of the Dataset
       thumbnail: an image thumbnail (web url)
       about: text about the dataset (optional)
       repository: the GitHub repository that hosts the dataset (optional)
    '''
    # Step 0. Define absolute paths to our recipe and output
    here = os.path.abspath(os.path.dirname(__file__))
    recipe_yml = os.path.join(here, "recipe.yml")

    # Step 1: Show required and recommended fields from recipe
    recipe = RecipeParser(recipe_yml)

    # Step 2: Create Dataset
    dataset = Schema("Dataset")

    # We can obtain these from the environment, or use reasonable defaults
    thumbnail = os.environ.get(
        'DATASET_THUMBNAIL',
        thumbnail or 'https://vsoch.github.io/datasets/assets/img/avocado.png')
    about = os.environ.get(
        'DATASET_ABOUT',
        about or 'This is a Dataset parsed by the openschemas/extractors container.')
    repository = os.environ.get('GITHUB_REPOSITORY',
                                repository or 'openschemas/extractors')
    description = os.environ.get('DATASET_DESCRIPTION', description or 'A Dataset')
    email = os.environ.get('DATASET_EMAIL')
    template = os.environ.get('DATASET_TEMPLATE', "google/dataset-table.html")

    # Can be one of:
    # google/dataset-table.html      (bootstrap)
    # google/visual-dataset.html     (see vsoch.github.io/zenodo-ml)
    # google/dataset.html            (just blank page, json metadata)
    # google/dataset-vue-table.html
    # see https://openschemas.github.io/schemaorg#7-embed-in-html-with-json-ld

    # Contact metadata
    contact = os.environ.get('GITHUB_ACTOR', contact)
    contact_url = os.environ.get('CONTACT_URL', repository)
    contact_description = os.environ.get('CONTACT_DESCRIPTION', 'Dataset maintainer')
    contact_type = os.environ.get('CONTACT_TYPE', 'customer support')
    contact_telephone = os.environ.get('CONTACT_TELEPHONE')

    # Download Link
    download_link = os.environ.get('DATASET_DOWNLOAD_LINK')
    encoding = os.environ.get('DATASET_ENCODING_FORMAT')

    if download_link is not None:
        download = Schema('DataDownload')
        download.add_property('encodingFormat', encoding)
        download.add_property('contentUrl', download_link)
        download = add_kwargs(download, 'DATASET_DOWNLOAD_KWARGS')
        dataset.add_property('distribution', [download])

    # Get the repository full url for contact
    if not contact_url.startswith('http'):
        contact_url = "https://www.github.com/%s" % contact_url

    if contact is not None:
        person = make_person(name=contact,
                             description=contact_description,
                             url=contact_url,
                             contact_type=contact_type,
                             telephone=contact_telephone)
        person = add_kwargs(person, 'CONTACT_KWARGS')
        dataset.add_property('creator', person)

    # dataset.properties
    dataset.add_property('version', version)
    dataset.add_property('description', description)
    dataset.add_property('name', name)
    dataset.add_property('thumbnailUrl', thumbnail)
    dataset.add_property('about', about)
    dataset = add_kwargs(dataset, 'DATASET_KWARGS')

    # Step 5: Validate Data Structure
    recipe.validate(dataset)

    if output_html:
        return make_dataset(dataset, template=template)
    return dataset.dump_json(pretty_print=True)
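# Example (hedged): a minimal sketch of how the Dataset extractor above might
# be called. The dataset name, version, description, and download link are
# illustrative assumptions; in practice these values are usually supplied
# through the DATASET_*, CONTACT_*, and GITHUB_* environment variables rather
# than hard-coded.
#
#   os.environ['DATASET_DOWNLOAD_LINK'] = 'https://example.com/data.zip'  # hypothetical
#   os.environ['DATASET_ENCODING_FORMAT'] = 'application/zip'             # hypothetical
#   html = extract(name="my-dataset",                                     # hypothetical
#                  version="1.0.0",
#                  contact="vsoch",
#                  description="An example dataset description",
#                  output_html=True)
#   with open("index.html", "w") as handle:
#       handle.write(html)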