예제 #1
0
def download_from_nexus(uri, config_file_path, output_path, nexus_endpoint,
                        nexus_bucket, unzip=False):
    """Fetch a Nexus resource and download its distribution file.

    A ``KnowledgeGraphForge`` session is created from the given config,
    endpoint and bucket, the resource identified by ``uri`` is retrieved,
    and the file referenced by ``distribution.contentUrl`` is downloaded
    into ``output_path``, overwriting any existing file.

    Parameters
    ----------
    uri
        Identifier of the resource to retrieve.
    config_file_path
        Path to the forge configuration file.
    output_path
        Directory the file is downloaded to (and extracted into when
        ``unzip`` is true).
    nexus_endpoint
        Nexus endpoint passed to the forge session.
    nexus_bucket
        Nexus bucket passed to the forge session.
    unzip : bool, optional
        When true, treat the downloaded file as a zip archive and extract
        all of its members into ``output_path``.

    Returns
    -------
    The resource object returned by ``forge.retrieve``.
    """
    kg_forge = KnowledgeGraphForge(
        config_file_path, endpoint=nexus_endpoint, bucket=nexus_bucket)
    resource = kg_forge.retrieve(id=uri)

    # NOTE(review): assumes the resource has a single distribution exposing
    # a `name` attribute that matches the downloaded file name -- confirm.
    target_file = os.path.join(output_path, resource.distribution.name)
    message = "Downloading the file to '{}'".format(target_file)
    print(message)
    kg_forge.download(
        resource,
        path=output_path,
        overwrite=True,
        follow="distribution.contentUrl",
    )

    if not unzip:
        return resource

    print("Decompressing ...")
    with zipfile.ZipFile(target_file, 'r') as archive:
        archive.extractall(output_path)
    return resource
예제 #2
0
def _preprocess_data(data, data_type, auth=None):
    """Preprocess input data according to the specified type.

    Possoble data types are:

    - "raw" use data as is provided in the request
    - "json_pgframe" create a PandasPGFrame from the provided JSON repr
    - "nexus_dataset" download a JSON dataset from Nexus and
      create a PandasPGFrame from this representation
    # - collection of Nexus resources to build a PG from
    # - (then i guess we need a bucket/org/project/token)
    """
    if data_type == "raw":
        # Use passed data as is
        return data
    elif data_type == "json_pgframe":
        return PandasPGFrame.from_json(data)
    elif data_type == "nexus_dataset":
        if auth is None:
            raise ValueError(
                "To use Nexus-hosted property graph as the dataset "
                "authentication token should be provided in the "
                "request header")
        forge = KnowledgeGraphForge(app.config["FORGE_CONFIG"],
                                    endpoint=data["endpoint"],
                                    bucket=data["bucket"],
                                    token=auth)
        resource = forge.retrieve(data["resource_id"])
        forge.download(resource, "distribution.contentUrl",
                       app.config["DOWNLOAD_DIR"])
        downloaded_file = os.path.join(app.config["DOWNLOAD_DIR"],
                                       resource.distribution.name)
        graph = PandasPGFrame.load_json(downloaded_file)
        os.remove(downloaded_file)
        return graph
    else:
        raise ValueError("Unknown data type")