# ---- Example 1 (Exemplo n.º 1) ----
def get_caltechdata(collection, production=True, datacite=False):
    """Harvest all records from CaltechDATA into a dataset collection.

    Always creates the collection from scratch: an existing collection
    directory is deleted before harvesting.

    Args:
        collection: path of the dataset collection to (re)create.
        production: if True, harvest from data.caltech.edu; otherwise
            use the TIND sandbox instance.
        datacite: if True, store only the plain DataCite metadata;
            otherwise store enriched metadata (including files) plus the
            record's 'updated' timestamp.
    """
    # Delete existing collection so every run starts clean
    if os.path.isdir(collection):
        shutil.rmtree(collection)
    if not dataset.init(collection):
        print("Dataset failed to init collection")
        exit()

    if production:
        url = "https://data.caltech.edu/api/records"
    else:
        url = "https://cd-sandbox.tind.io/api/records"

    # size=9000 presumably exceeds the total record count, so one request
    # returns everything -- TODO confirm against the live repository
    response = requests.get(url + "/?size=9000")
    hits = response.json()

    print(hits)
    for h in progressbar(hits["hits"]["hits"]):
        rid = str(h["id"])
        if datacite:
            # Get just DataCite metadata
            metadata = decustomize_schema(h["metadata"])
        else:
            # Get enriched metadata records (including files)
            metadata = decustomize_schema(h["metadata"], True, True, True)
            metadata["updated"] = h["updated"]

        if not dataset.create(collection, rid, metadata):
            err = dataset.error_message()
            print(err)
# ---- Example 2 (Exemplo n.º 2) ----
def get_metadata(idv):
    """Return the TCCON version of DataCite metadata for record *idv*.

    Fetches the raw record from the CaltechDATA API, converts it to
    DataCite 4.3 with decustomize_schema, validates it against the 4.3
    schema (printing every validation error and exiting on failure), and
    adds the fixed TCCON release lag.

    Raises:
        AssertionError: if the API returns an error message or the
            response has no 'metadata' property.
    """
    api_url = "https://data.caltech.edu/api/record/"

    r = requests.get(api_url + str(idv))
    r_data = r.json()
    if "message" in r_data:
        raise AssertionError("id " + str(idv) +
                             " expected http status 200, got " +
                             str(r.status_code) + " " + r_data["message"])
    if "metadata" not in r_data:
        # BUG FIX: r_data is a dict -- concatenating it directly to a str
        # raised TypeError; wrap it in str()
        raise AssertionError(
            "expected as metadata property in response, got " + str(r_data))
    metadata = r_data["metadata"]

    metadata = decustomize_schema(metadata, pass_emails=True, schema="43")

    try:
        assert schema43.validate(metadata)
    except AssertionError:
        # Re-run validation to collect and report each individual error.
        # BUG FIX: original iterated errors on an undefined name
        # `instance`; the object being validated is `metadata`.
        v = schema43.validator.validate(metadata)
        errors = sorted(v.iter_errors(metadata), key=lambda e: e.path)
        for error in errors:
            print(error.message)
        exit()

    # Add time lag (fixed)
    metadata['release_lag'] = 30

    return metadata
# ---- Example 3 (Exemplo n.º 3) ----
def read_records(data):
    """Re-deposit records from an API 'hits' structure.

    For each record: download any files listed under
    'electronic_location_and_access', convert the metadata to DataCite
    4.3, split creator names into family/given parts, and write the
    record (published) to the pilot repository via caltechdata_write.
    """
    for record in data:
        rid = str(record["id"])
        metadata = record["metadata"]
        files = []
        if "electronic_location_and_access" in metadata:
            for erecord in metadata["electronic_location_and_access"]:
                url = erecord["uniform_resource_identifier"]
                fname = erecord["electronic_name"][0]
                f = download_file(url, fname)
                if f is not None:
                    files.append(f)
        else:
            print("No Files")
        metadata = decustomize_schema(metadata,
                                      pass_emails=True,
                                      pass_owner=True,
                                      schema="43")
        # Need to figure out identifiers
        metadata.pop("identifiers")
        # Separate family and given names; names without a comma are
        # treated as organizations
        for creator in metadata["creators"]:
            name = creator["name"]
            print(name)
            if "," in name:
                print("Yes")
                split = name.split(",")
                creator["familyName"] = split[0]
                creator["givenName"] = split[1]
                # BUG FIX: original wrote the misspelled key "nameYype",
                # so personal creators never got a nameType
                creator["nameType"] = "Personal"
            else:
                creator["nameType"] = "Organizational"
        print(metadata)
        doi = caltechdata_write(metadata,
                                schema="43",
                                pilot=True,
                                files=files,
                                publish=True)
        print(doi)
    # NOTE(review): the lines below reference names never defined in this
    # function (`c`, `hits`, `prefix` via f-string) and appear fused in
    # from a different, paginated-harvest snippet -- confirm intent.
    # We don't include the HTE records due to Elasticsearch limitations
    chunkurl = f"{url}?q=NOT(subjects:%27HTE%27)&sort=-mostrecent&size=1000&page={c}"
    response = requests.get(chunkurl).json()
    hits += response["hits"]["hits"]

# NOTE(review): this loop relies on names defined elsewhere in the full
# script (hits, prefix, existing, path, bucket, s3_boto, upload_json) and
# is truncated at the end of this excerpt.
for h in hits:
    rid = str(h["id"])
    # The DOI suffix (part after "<prefix>/") is used as the identifier
    doi = str(h["metadata"]["doi"])
    idv = doi.split(f"{prefix}/")[1]

    print(rid)

    # ALSO need to do file checks
    if idv not in existing:

        # Convert the record to DataCite 4.3 metadata
        metadata = decustomize_schema(h["metadata"], True, True, True, "43")
        # Write both the raw API data and DataCite metadata as json files
        location = f"{path}/{doi}/datacite.json"
        upload_json(metadata, bucket, location, s3_boto)
        location = f"{path}/{doi}/raw.json"
        upload_json(h, bucket, location, s3_boto)
        # Download and upload files
        if "electronic_location_and_access" in metadata:
            count = len(metadata["electronic_location_and_access"])
            for erecord in metadata["electronic_location_and_access"]:
                size = float(erecord["file_size"])
                name = erecord["electronic_name"][0]
                # Only embargo-free ("open") files are fetched
                if erecord["embargo_status"] == "open":
                    file = f"{path}/{doi}/{name}"
                    # NOTE(review): statement truncated in this excerpt
                    with urllib.request.urlopen(
                            erecord["uniform_resource_identifier"]
# ---- Example 5 (Exemplo n.º 5) ----
    "1920's",
    "1930's",
    "1940's",
    "1950's",
    "1960's",
    "1970's",
    "1980's",
    "1990's",
    "2000's",
    "2010's",
    "2020's",
]

# NOTE(review): truncated excerpt -- `hits` comes from an earlier API call
# outside this view, and the `lon` list is cut off before its closing
# bracket.
for h in hits["hits"]["hits"]:
    print("Adding record: ", h["id"])
    metadata = decustomize_schema(h["metadata"])
    if "geoLocations" in metadata:
        geo = metadata["geoLocations"]
        for g in geo:
            if "geoLocationBox" in g:
                box = g["geoLocationBox"]
                # Appears to build the four corner points of the bounding
                # box (lat paired with lon below) -- TODO confirm order
                lat = [
                    box["northBoundLatitude"],
                    box["northBoundLatitude"],
                    box["southBoundLatitude"],
                    box["southBoundLatitude"],
                ]
                lon = [
                    box["eastBoundLongitude"],
                    box["westBoundLongitude"],
                    box["eastBoundLongitude"],
# ---- Example 6 (Exemplo n.º 6) ----
import os, requests
from caltechdata_api import decustomize_schema
from ames.matchers import update_citation

# Get access token from TIND set as environment variable with source token.bash
token = os.environ["TINDTOK"]

# Toggle between the production and sandbox CaltechDATA instances
production = True

if production:
    url = "https://data.caltech.edu/api/records"
else:
    url = "https://cd-sandbox.tind.io/api/records"

# Fetch up to 1000 records tagged with the TCCON subject in one request
response = requests.get(url + "/?size=1000&q=subjects:TCCON")
hits = response.json()

# Update the citation for every TCCON record found
for h in hits["hits"]["hits"]:
    rid = h["id"]
    print(rid)
    record = decustomize_schema(h["metadata"], True)
    update_citation(record, rid, token)