def artstor_cleanup_creator(body, ctype, prop="sourceResource/creator"):
    """
    Service that accepts a JSON document and cleans the
    sourceResource/creator field by removing the values in CLEANUP if the
    field value begins with them
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        item = getprop(data, prop)
        if not isinstance(item, list):
            item = [item]
        for i in range(len(item)):
            for s in CLEANUP:
                item[i] = re.sub(r"(?i)^{0}".format(s), "", item[i].strip()).lstrip()
            
        setprop(data, prop, item[0] if len(item) == 1 else item)

    return json.dumps(data)
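A minimal standalone sketch of the prefix-stripping step above. CLEANUP is defined elsewhere in the module, so the two prefixes below are purely illustrative assumptions.

import re

# Hypothetical stand-ins for the module-level CLEANUP patterns.
CLEANUP = ["creator:", "author:"]

def strip_prefixes(value):
    # Same form as the service: case-insensitive substitution anchored at the start.
    for s in CLEANUP:
        value = re.sub(r"(?i)^{0}".format(s), "", value.strip()).lstrip()
    return value

print(strip_prefixes("Creator: Ansel Adams"))   # -> "Ansel Adams"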
Example #2
def uscsetdataprovider(body, ctype, prop="dataProvider"):
    """   
    Service that accepts a JSON document and sets the "dataProvider"
    field of that document to:

    1. The first value of the originalRecord/source field (placed in
       dataProvider in the oai-to-dpla module) for the chs set (setSpec
       p15799coll65)
    2. The string "University of Southern California. Libraries" for all
       other sets

    For primary use with USC documents
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type','text/plain')
        return "Unable to parse body as JSON"


    data_provider = getprop(data, "dataProvider", True)
    if getprop(data, "originalRecord/setSpec") == "p15799coll65":
        setprop(data, "dataProvider", data_provider[0])
    else:
        setprop(data, "dataProvider",
                "University of Southern California. Libraries")

    return json.dumps(data)
Example #3
def update_document(document, filepath, mime, status):
    """
    Updates the document with the filepath of the downloaded thumbnail.

    Arguments:
        document object - document for updating (decoded by json module)
        filepath string - filepath to insert
        mime string - MIME type to set as the object format
        status string - value to set on admin/object_status

    Returns:
        The document from parameter with additional field containing the
        filepath.
    """
    if filepath:
        base_url = module_config().get('thumbs_root_url')
        obj = document["object"]
        obj["@id"] = base_url + filepath
        obj["format"] = mime
        document["object"] = obj
    if mime:
        obj = document["object"]
        obj["format"] = mime
    if status:
        setprop(document, "admin/object_status", status)

    return document
def mwdlenrichstatelocatedin(body, ctype, action="mdl_enrich_state_located_in",
                             prop="sourceResource/stateLocatedIn"):
    """
    Service that accepts a JSON document and enriches the "stateLocatedIn"
    field of that document by mapping each semicolon-separated state code to
    its full state name.

    For primary use with MWDL documents.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data,prop):
        sli = []
        values = getprop(data,prop)
        for v in values.split(";"):
            if STATE_CODES.get(v):
                sli.append(STATE_CODES[v])
            else:
                sli.append(v)
        setprop(data, prop, "; ".join(sli))

    return json.dumps(data)
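The enrichment simply maps each semicolon-separated code through STATE_CODES and leaves unknown values untouched. A small sketch with an assumed two-entry table (the real STATE_CODES lives elsewhere in the module):

STATE_CODES = {"UT": "Utah", "NV": "Nevada"}   # assumed subset for illustration

value = "UT;NV;Unknown place"
print("; ".join(STATE_CODES.get(v, v) for v in value.split(";")))
# -> Utah; Nevada; Unknown place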
Example #5
def enrich_temporal_date(body, ctype, prop="aggregatedCHO/temporal", date_key="name"):
    """
    Service that accepts a JSON document and extracts the "created date" of the item, using the
    following rules:

    a) Looks in the list of fields specified by the 'prop' parameter
    b) Extracts all dates, and sets the created date to the earliest date
    """
    try:
        data = json.loads(body)
    except:
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return "Unable to parse body as JSON"

    date_candidates = []
    for p in prop.split(','):
        if exists(data, p):
            v = getprop(data, p)
            for s in v:
                a, b = parse_date_or_range(s[date_key])
                date_candidates.append( {
                    "begin": a,
                    "end": b,
                    "displayDate" : s[date_key]
                })
    if date_candidates:
        setprop(data, p, date_candidates)

    return json.dumps(data)
Example #6
def decode_html(body, ctype, prop=None):
    """Decodes any encoded html in the prop

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to decode
    
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    REGEX = ('&quot;', '"'), ('&amp;', '&'), ('&lt;', '<'), ('&gt;', '>')

    if prop and exists(data, prop):
        decoded = []
        v = getprop(data, prop)
        if not isinstance(v, list):
            v = [v]
        for s in v:
            if isinstance(s, basestring):
                for p, r in REGEX:
                    s = re.sub(p, r, s)
            decoded.append(s)

        setprop(data, prop, decoded)
                

    return json.dumps(data)
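The entity substitutions above are self-contained, so the decoding step can be tried directly. A minimal sketch:

import re

REGEX = ('&quot;', '"'), ('&amp;', '&'), ('&lt;', '<'), ('&gt;', '>')

def decode(s):
    for pattern, repl in REGEX:
        s = re.sub(pattern, repl, s)
    return s

print(decode("&lt;b&gt;Fish &amp; Chips&lt;/b&gt;"))   # -> <b>Fish & Chips</b>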
Example #7
def setcontext(body, ctype, prop="@context"):
    """   
    Service that accepts a JSON document and sets the "@context" field of that
    document.
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type','text/plain')
        return "Unable to parse body as JSON"

    item_context = {
        "@context": "http://dp.la/api/items/context",
        "aggregatedCHO": "#sourceResource",
        "@type": "ore:Aggregation"
    }

    collection_context = {
        "@context": "http://dp.la/api/collections/context",
        "@type": "dcmitype:Collection" 
    }

    if data["ingestType"] == "item":
        data.update(item_context)
        setprop(data, "sourceResource/@id", "%s#sourceResource" % data["@id"])
    else:
        data.update(collection_context)

    return json.dumps(data)
Example #8
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the prop
    field to the temporal field.
    """

    if not prop:
        logger.error("Prop param is None in %s" % __name__)
        return body

    REGSEARCH = [
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{4}\s*[-/]\s*\d{4}",
        "\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}",
        "\d{4}s?",
        "\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        ".*circa.*"
        ]

    def cleanup(s):
        s = re.sub("[\(\)\.\?]", "",s)
        return s.strip()

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = getprop(data, prop)
        remove = []
        toprop = getprop(data, to_prop) if exists(data, to_prop) else []
        
        for v in (values if isinstance(values, list) else [values]):
            c = cleanup(v)
            for pattern in REGSEARCH:
                m = re.compile(pattern, re.I).findall(c)
                if len(m) == 1 and not re.sub(m[0], "", c).strip():
                    if m[0] not in toprop:
                        toprop.append(m[0])
                    # Append the non-cleaned value to remove
                    remove.append(v)
                    break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(values) == len(remove):
                delprop(data, prop)
            else:
                setprop(data, prop, [v for v in values if v not in remove])
            

    return json.dumps(data)
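A value moves to temporal only when one of the REGSEARCH patterns matches and that match consumes the entire cleaned string. A standalone sketch of that test using three of the patterns above (the sample values are hypothetical):

import re

patterns = [r"\d{4}\s*[-/]\s*\d{4}", r"\d{4}s?", r".*circa.*"]

def looks_like_date(value):
    cleaned = re.sub(r"[\(\)\.\?]", "", value).strip()
    for pattern in patterns:
        m = re.compile(pattern, re.I).findall(cleaned)
        # Only a single match that covers the whole string counts.
        if len(m) == 1 and not re.sub(m[0], "", cleaned).strip():
            return True
    return False

print(looks_like_date("(1920-1925)"))      # True
print(looks_like_date("circa 1890"))       # True
print(looks_like_date("Portrait, 1890"))   # False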
def mdlenrichlocation(body,ctype,action="mwdl_enrich_location", prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of that document. 

    For primary use with MWDL documents.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data,prop):
        spatials = []
        for spatial in iterify(getprop(data,prop)):
            if (is_spatial(spatial)): 
                spatials.append(format_spatial(spatial))

        if (len(spatials) > 0): 
            setprop(data, prop, spatials)
        else:
            delprop(data, prop)

    return json.dumps(data)
Example #10
def dedup_value(body, ctype, action="dedup_value", prop=None):
    '''
    Service that accepts a JSON document and enriches the prop field of that document by:

    a) Removing duplicates
    '''

    if not prop:
        # Nothing to do if no prop was given.
        return body

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    for p in prop.split(","):
        if exists(data, p):
            v = getprop(data, p)
            if isinstance(v, list):
                # Remove whitespace, periods, parens, brackets
                clone = [re.sub("[ \.\(\)\[\]\{\}]", "", s).lower() for s in v]
                # Get index of unique values
                index = list(set([clone.index(s) for s in list(set(clone))]))
            
                setprop(data, p, [v[i] for i in index])

    return json.dumps(data)
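The dedup keeps the first occurrence of each value after normalizing it (strip spaces and punctuation, lowercase). A standalone sketch of the same index trick, sorted here only to make the output order stable:

import re

v = ["Maps.", "maps", "Atlases", "Maps"]
clone = [re.sub(r"[ \.\(\)\[\]\{\}]", "", s).lower() for s in v]
index = sorted(set(clone.index(s) for s in set(clone)))
print([v[i] for i in index])   # -> ['Maps.', 'Atlases']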
def georgiasetspectype(body, ctype):
    """   
    Service that accepts a JSON document and sets the "sourceResource/specType"
    field of that document from the "sourceResource/type" field

    For primary use with DLG documents
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type','text/plain')
        return "Unable to parse body as JSON"

    TYPE_TO_SPEC_TYPE = {
        "books": "Book",
        "government": "Government Document",
        "periodicals": "Serial"
    }

    type = getprop(data, "sourceResource/type", True)
    if type:
        spec_type = []
        for s in iterify(type):
            for k, v in TYPE_TO_SPEC_TYPE.items():
                if k in s.lower() and v not in spec_type:
                    spec_type.append(v)

        if spec_type:
            setprop(data, "sourceResource/specType", spec_type)

    return json.dumps(data)
def uscenrichlocation(body, ctype, action="usc_enrich_location",
                      prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of
    that document by:

    1. If one of the spatial values is a lat/lon coordinate, removing all other
       values
    2. Removing 1-3 digit numbers and values that contain "s.d"

    For primary use with USC documents.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        spatial = getprop(data, prop)

        coordinates = find_coordinates(spatial)
        if coordinates:
            spatial = [{"name": "%s, %s" % coordinates}]
        else:
            spatial = clean(spatial)
            spatial = join_values(spatial)

        setprop(data, prop, spatial)

    return json.dumps(data)
Example #13
def enrich_language(body, ctype, action="enrich_language", prop="sourceResource/language"):
    '''
    Service that accepts a JSON document and enriches the "language" field of that document
    by:

    a) converting a list of language values into list of dictionaries: {"name": language}

    By default it works on the 'language' field, but can be overridden by passing the name of the field to use
    as a parameter
    '''

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        langs = getprop(data, prop)

        if isinstance(langs, basestring):
            setprop(data, prop, {"name": langs})
        elif isinstance(langs, list):
            languages = []
            for l in langs:
                languages.append({"name": l})
            setprop(data, prop, languages)

    return json.dumps(data)
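The enrichment only changes the shape of the value. A quick sketch of the before/after forms (plain str is used here in place of the module's basestring check):

def to_name_dicts(langs):
    if isinstance(langs, str):
        return {"name": langs}
    return [{"name": l} for l in langs]

print(to_name_dicts("English"))              # -> {'name': 'English'}
print(to_name_dicts(["English", "French"]))  # -> [{'name': 'English'}, {'name': 'French'}]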
Example #14
def capitalize(data, prop):
    """
    Capitalizes the value of the related property path.
    Modifies given dictionary (data argument).
    """
    def str_capitalize(s):
        """
        Changes the first letter of the string into uppercase.
        python "aaa".capitalize() can be used, other words first letters
        into lowercase.
        """
        if s:
            return s[0].upper() + s[1:]
        return s

    if exists(data, prop):
        v = getprop(data, prop, keyErrorAsNone=True)
        if v:
            if isinstance(v, basestring):
                setprop(data, prop, str_capitalize(v))
            elif isinstance(v, list):
                new_v = []
                for s in v:
                    if isinstance(s, basestring):
                        new_v.append(str_capitalize(s))
                    else:
                        new_v.append(s)
                setprop(data, prop, new_v)
Example #15
def check_date_format(data, prop):
    """Checks that the begin and end dates are in the proper format"""
    date = getprop(data, prop, True)
    if date:
        for d in iterify(date):
            for k, v in d.items():
                if v and k != "displayDate":
                    try:
                        ymd = [int(s) for s in v.split("-")]
                    except:
                        err = "Invalid date.%s: non-integer in %s for %s" % \
                              (k, v, data.get("_id"))
                        logger.error(err)
                        setprop(d, k, None)
                        continue

                    year = ymd[0]
                    month = ymd[1] if len(ymd) > 1 else 1
                    day = ymd[2] if len(ymd) > 2 else 1
                    try:
                        datetime.datetime(year=year, month=month, day=day)
                    except ValueError, e:
                        logger.error("Invalid date.%s: %s for %s" %
                                     (k, e, data.get("_id")))
                        setprop(d, k, None)
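The format check amounts to: split on hyphens, default a missing month or day to 1, and let datetime reject impossible dates. A standalone sketch:

import datetime

def is_valid(value):
    try:
        ymd = [int(s) for s in value.split("-")]
        datetime.datetime(year=ymd[0],
                          month=ymd[1] if len(ymd) > 1 else 1,
                          day=ymd[2] if len(ymd) > 2 else 1)
        return True
    except (ValueError, IndexError):
        return False

print(is_valid("1923-02"))     # True
print(is_valid("1923-02-30"))  # False (February 30th does not exist)
print(is_valid("circa 1923"))  # False (non-integer part)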
Example #16
def setspectype(body, ctype, prop="sourceResource/type"):
    """   
    Service that accepts a JSON document and sets the "sourceResource/specType"
    field of that document from the prop field
    """

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type','text/plain')
        return "Unable to parse body as JSON"

    TYPE_TO_SPEC_TYPE = {
        "book": "Book",
        "government": "Government Document",
        "periodical": "Serial",
        "nonmusic": "Nonmusic",
        "still image": "Photograph/Pictorial Works",
        "mixed material": "Mixed Material"
    }

    if exists(data, prop):
        spec_type = []
        for s in iterify(getprop(data, prop)):
            for k, v in TYPE_TO_SPEC_TYPE.items():
                if k in s.lower() and v not in spec_type:
                    spec_type.append(v)

        if spec_type:
            setprop(data, "sourceResource/specType", spec_type)

    return json.dumps(data)
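The specType assignment is a case-insensitive substring lookup against TYPE_TO_SPEC_TYPE. A minimal sketch of that step on hypothetical type values:

TYPE_TO_SPEC_TYPE = {"book": "Book",
                     "government": "Government Document",
                     "periodical": "Serial"}

types = ["Text", "Government documents", "Books"]
spec_type = []
for s in types:
    for k, v in TYPE_TO_SPEC_TYPE.items():
        if k in s.lower() and v not in spec_type:
            spec_type.append(v)
print(spec_type)   # -> ['Government Document', 'Book']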
Example #17
def enrichdate(body, ctype, action="enrich-format", prop="aggregatedCHO/date"):
    """
    Service that accepts a JSON document and extracts the "created date" of the item, using the
    following rules:

    a) Looks in the list of fields specified by the 'prop' parameter
    b) Extracts all dates, and sets the created date to the earliest date 
    """
    try:
        data = json.loads(body)
    except:
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE,  HTTP_TYPE_TEXT)
        return "Unable to parse body as JSON"

    date_candidates = []
    for p in prop.split(','):
        if exists(data, p):
            v = getprop(data, p)
            date_candidates = []
            for s in (v if not isinstance(v, basestring) else [v]):
                a, b = parse_date_or_range(s)
                date_candidates.append( {
                        "begin": a,
                        "end": b,
                        "displayDate" : s
                        })
        date_candidates.sort(key=lambda d: d["begin"] if d["begin"] is not None else DEFAULT_DATETIME_STR)
        if date_candidates:
            setprop(data, p, date_candidates[0])

    return json.dumps(data)
Example #18
def replace_substring(body, ctype, prop=None, old=None, new=None):
    """Replaces a substring in prop

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to apply replacing
    old -- the substring to replace
    new -- the substring to replaced old with
    
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not old or not new:
        logger.error("No old or new parameters were provided")
    else:
        if exists(data, prop):
            v = getprop(data, prop)
            setprop(data, prop, v.replace(old, new))

    return json.dumps(data)
Example #19
def set_prop(body, ctype, prop=None, value=None, condition_prop=None,
             condition_value=None):
    """Sets the value of prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to set
    value -- the value to set prop to
    condition_prop -- (optional) the field that must exist to set the prop
    
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not value:
        logger.error("No value was supplied to set_prop.")
    else:
        # If there is no condition_prop, set the prop, creating it if it does
        #not exist. If there is a condition_prop, only set the prop if the
        # condition_prop exists.
        if not condition_prop or exists(data, condition_prop):
            setprop(data, prop, value)

    return json.dumps(data)
def digital_commonwealth_enrich_location(body, ctype, action="digital_commonwealth_enrich_location", prop="sourceResource/spatial"):
    """
    Service that massages a Digital Commonwealth JSON document.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Strings which are present in the spatial field, which do end up being geocoded, 
    #  but are not locations
    NON_SPATIALS = ["Aerial views.",
                    "Church history.", 
                    "Dwellings",
                    "Dwellings.",
                    "History",
                    "Pictorial works"]

    if (exists(data, prop)): 
        # Spatial field is simply a list of strings, convert to a list 
        #  of dictionaries with the name key set to the string value
        spatials = []
        for spatial in iterify(getprop(data, prop)):
            if (isinstance(spatial, basestring) \
                and spatial not in NON_SPATIALS):
                spatials.append({"name": format_spatial(spatial)})
                
        setprop(data, prop, spatials)

    return json.dumps(data)
def artstor_cleanup_creator(body, ctype, prop="sourceResource/creator"):
    """
    Service that accepts a JSON document and cleans the
    sourceResource/creator field by removing the values in CLEANUP if the
    field value begins with them
    """

    try:
        assert ctype.lower() == HTTP_TYPE_JSON, "%s is not %s" % (HTTP_HEADER_TYPE, HTTP_TYPE_JSON)
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    if exists(data, prop):
        item = getprop(data, prop)
        if not isinstance(item, list):
            item = [item]
        for i in range(len(item)):
            for s in CLEANUP:
                item[i] = re.sub(r"(?i)^{0}".format(s), "", item[i].strip()).lstrip()
            
        setprop(data, prop, item[0] if len(item) == 1 else item)

    return json.dumps(data)
Example #22
File: pa_mapper.py Project: dpla/ingestion
    def map_contributor(self):
        prop = "contributor"
        if exists(self.provider_data, prop):
            contributors = iterify(self.provider_data.get(prop))
            setprop(self.mapped_data, "dataProvider", contributors[-1])
            if len(contributors) > 1:
                self.update_source_resource({"contributor": contributors[:-1]})
Example #23
def oaimodstodpla(body, ctype, geoprop=None, provider=None):
    """
    Convert output of JSON-ified OAI MODS format into the DPLA JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type","text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {
        "@context": CONTEXT,
        "sourceResource": {}
    }

    if provider == "BPL":
        data = remove_key_prefix(data, "mods:")

    # Apply all transformation rules from original document
    transformer_pipeline = {}
    transformer_pipeline.update(CHO_TRANSFORMER.get(provider, {}),
                                **CHO_TRANSFORMER["common"])
    for p in transformer_pipeline:
        if exists(data, p):
            out["sourceResource"].update(transformer_pipeline[p](data, p))
    transformer_pipeline = {}
    transformer_pipeline.update(AGGREGATION_TRANSFORMER.get(provider, {}),
                                **AGGREGATION_TRANSFORMER["common"])
    for p in transformer_pipeline:
        if exists(data, p):
            out.update(transformer_pipeline[p](data, p))

    # Apply transformations that are dependent on more than one
    # original document field
    if provider == "HARVARD":
        out["sourceResource"].update(identifier_transform_harvard(data))
        out.update(url_transform_harvard(data))
        out.update(data_provider_transform_harvard(data))

    # Join dataProvider with isPartOf for BPL
    if provider == "BPL":
        try:
            ipo = getprop(out, "dataProvider") + ". " + \
                  getprop(out, "sourceResource/isPartOf")
            setprop(out, "sourceResource/isPartOf", ipo.replace("..", "."))
        except:
            pass

    # Strip out keys with None/null values?
    out = dict((k,v) for (k,v) in out.items() if v)

    return json.dumps(out)
def format_spatial(spatial):
    name = getprop(spatial, "name")
    for regex, repl in REGEX_REPLACEMENTS: 
        if (regex.search(name)): 
            name = regex.sub(repl, name).strip()
            setprop(spatial, "name", name)

    return spatial
Example #25
    def map_provider(self, _dict, tag, codes):
        values = self._get_values(_dict, codes)
        if "HT" in values and "avail_ht" in values:
            provider = {
                "@id": "http://dp.la/api/contributor/hathitrust",
                "name": "HathiTrust"
            }
            setprop(self.mapped_data, "provider", provider)
Example #26
    def update_title(self):
        prop = "sourceResource/title"
        title_list = filter(None, getprop(self.mapped_data, prop))
        if title_list:
            title = [" ".join(t) for t in title_list]
            setprop(self.mapped_data, prop, title)
        else:
            delprop(self.mapped_data, prop)
Example #27
    def map_data_provider(self, _dict, tag, codes):
        data_provider = []
        for v in self._get_values(_dict, codes):
            namespace = v.split(".")[0]
            data_provider.append(self.data_provider_mapping.get(namespace))

        data_provider = filter(None, data_provider)
        if data_provider:
            setprop(self.mapped_data, "dataProvider", data_provider)
def texas_enrich_location(body, ctype, action="texas_enrich_location",
                          prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of
    that document.

    For use with the texas profile
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"


    def _get_coordinates(value):
        lat, lon = None, None
        for v in value.split(";"):
            if "north=" in v:
                lat = v.split("=")[-1]
            elif "east=" in v:
                lon = v.split("=")[-1]

        if lat and lon:
            return (lat, lon)
        else:
            return ()

    if exists(data, prop):
        spatial = []
        values = getprop(data,prop)

        for v in values:
            sp = {"name": v}
            shredded = [s.strip() for s in v.split(" - ")]

            coordinates = _get_coordinates(sp["name"]) 
            if coordinates:
                sp["name"] = "%s, %s" % coordinates

            if len(shredded) < 5:
                if not re.search("\d", sp["name"]):
                    sp["country"] = shredded[0]
                if "country" in sp:
                    if sp["country"] in ["United States", "Canada"]:
                        try:
                            sp["state"] = shredded[1]
                            sp["county"] = shredded[2]
                            sp["city"] = shredded[3]
                        except Exception, e:
                            logger.debug("Error enriching location %s: %s" %
                                         (data["_id"], e))
            spatial.append(sp)
        logger.debug("SPATIAL: %s" % spatial)
        setprop(data, prop, spatial)

    return json.dumps(data)
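The coordinate parsing only looks for "north=" and "east=" segments inside the semicolon-separated value. A standalone sketch of that helper on a made-up value:

def get_coordinates(value):
    lat, lon = None, None
    for v in value.split(";"):
        if "north=" in v:
            lat = v.split("=")[-1]
        elif "east=" in v:
            lon = v.split("=")[-1]
    return (lat, lon) if lat and lon else ()

print(get_coordinates("north=33.2148; east=-97.1331"))   # -> ('33.2148', '-97.1331')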
Example #29
    def extend_prop(self, prop, _dict, codes, label=None, values=None):
        if values is None:
            values = self._get_values(_dict, codes)

        if values:
            if label:
                values.insert(0, label)
            prop_value = self._get_mapped_value(prop)
            prop_value.extend(self._join_values(prop, values))
            setprop(self.mapped_data, prop, prop_value)
Example #30
    def update_is_shown_at(self):
        prop = "sourceResource/identifier"
        if exists(self.mapped_data, prop):
            for v in iterify(getprop(self.mapped_data, prop)):
                if v.startswith("Hathi: "):
                    _id = v.split("Hathi: ")[-1]
                    is_shown_at = "http://catalog.hathitrust.org/Record/%s" % _id
                    setprop(self.mapped_data, "isShownAt", is_shown_at)
                    break
Example #31
def artstor_cleanup(body, ctype):

    try:
        assert ctype.lower() == HTTP_TYPE_JSON, "%s is not %s" % (
            HTTP_HEADER_TYPE, HTTP_TYPE_JSON)
        data = json.loads(body)
    except Exception as e:
        error_text = "Bad JSON: %s: %s" % (e.__class__.__name__, str(e))
        logger.exception(error_text)
        response.code = HTTP_INTERNAL_SERVER_ERROR
        response.add_header(HTTP_HEADER_TYPE, HTTP_TYPE_TEXT)
        return error_text

    data_provider_key = u"dataProvider"
    if exists(data, data_provider_key):
        item = getprop(data, data_provider_key)
        if isinstance(item, basestring):
            cleaned_data_provider = item.replace("Repository:", "").lstrip()
            setprop(data, data_provider_key, cleaned_data_provider)

    return json.dumps(data)
Example #32
def test_geocode_skip_united_states():
    """Should not add coordinates when name or country value is 
    'United States' or 'États-Unis' or 'USA'
    """
    INPUT = {
        "id": "12345",
        "_id": "12345",
        "sourceResource": {
            "spatial": ""
        }
    }

    url = server() + "geocode"
    for v in ["United States", "United States.", u"États-Unis", 
              u"États-Unis.", "USA"]:
        for field in ["name", "country"]:
            setprop(INPUT, "sourceResource/spatial", {field: v})
            resp, content = H.request(url, "POST", body=json.dumps(INPUT))
            assert resp.status == 200
            for place in json.loads(content)['sourceResource']['spatial']:
                assert 'coordinates' not in place.keys()
def set_ucldc_dataprovider(body, ctype):
    '''For ucldc, we always have an originalRecord/collection entry.
    This has a repository object which may or may not have a list of
    campuses.
    If a campus exists, concatenate the campus and repository names,
    separated by a comma, to form the dataProvider value.
    '''
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    collection = getprop(data, 'originalRecord/collection')[0]
    repo = collection['repository'][0]
    campus = None
    if len(repo['campus']):
        campus = repo['campus'][0]
    dataProvider = repo['name']
    if campus:
        dataProvider = ', '.join((campus['name'], repo['name']))
    setprop(data, 'dataProvider', dataProvider)
    data['provider'] = {}
    setprop(data, 'provider/name', dataProvider)
    setprop(data, 'provider/@id', collection['@id'])
    data['sourceResource']['stateLocatedIn'] = [{'name': 'California'}]
    return json.dumps(data)
def set_field_from_value_mode(data, field, mode, value, multivalue=True):
    '''Set the value for the data "field" from data in collection
    ckey field with the value passed in.
    '''
    logger.debug('Field:{} mode:{} value:{} mv:{}'.format(field, mode, value, multivalue))
    if value: #no value don't bother
        if mode=='overwrite':
            if exists(data, field):
                setprop(data, field, value)
            else:
                pp,pn = tuple(field.lstrip('/').split('/',1))
                if not pp in data:
                    data[pp] = {}
                data[pp][pn] = value
        elif mode=='append':
            new_value = []
            if exists(data, field):
                old_value = getprop(data, field)
                if isinstance(old_value, list):
                    new_value.extend(old_value)
                else:
                    new_value.append(old_value)
            if isinstance(value, list):
                new_value.extend(value)
            else:
                new_value.append(value)
            setprop(data, field, new_value)
        else: # fill blanks
            if not exists(data, field) or not getprop(data,
                    field,keyErrorAsNone=True):
                if multivalue and not isinstance(value, list):
                    value = [value]
                setprop(data, field, value)
    return data
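A rough sketch of what the three modes do. The real function works on slash-delimited paths through exists/getprop/setprop; plain dictionary keys are used here only to show the behavior:

def set_mode(record, key, mode, value, multivalue=True):
    if not value:
        return record
    if mode == 'overwrite':
        record[key] = value
    elif mode == 'append':
        old = record.get(key, [])
        old = old if isinstance(old, list) else [old]
        new = value if isinstance(value, list) else [value]
        record[key] = old + new
    else:  # fill blanks only
        if not record.get(key):
            record[key] = [value] if multivalue and not isinstance(value, list) else value
    return record

print(set_mode({"rights": "old"}, "rights", "overwrite", "new"))   # {'rights': 'new'}
print(set_mode({"subject": ["a"]}, "subject", "append", ["b"]))    # {'subject': ['a', 'b']}
print(set_mode({"type": "text"}, "type", "fill", "image"))         # {'type': 'text'}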
Example #35
def geocode_region(spatial):
    setprop(spatial, "coordinates",
            "%s, %s" % REGIONS[getprop(spatial, "name")])
    delprop(spatial, "county")
    setprop(spatial, "state", "South Carolina")
    setprop(spatial, "country", "United States")
    return spatial
def artstor_spatial_to_dataprovider(body, ctype,
                                    prop="sourceResource/spatial"):
    """ Splits spatial on semicolon and copies the first value to dataProvider
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        if isinstance(v, list):
            v = v[0]
        if isinstance(v, basestring):
            v = v.split(";")[0]    
            setprop(data, "dataProvider", v)
        delprop(data, prop)

    return json.dumps(data)
def enrichlocation(body,ctype,action="enrich_location", prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of that document by
    iterating through the spatial fields and mapping to the state and iso3166-2, if not already
    mapped, through the get_isostate function. This function takes the optional parameter abbrev;
    if it is set, it will search the fields for state name abbreviations. If a previous provider-
    specific location enrichment module ran, the default is to not search those fields for state
    name abbreviations, but only for full state names.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data,prop):
        v = iterify(getprop(data,prop))

        for i in range(len(v)):
            if isinstance(v[i], dict):
                for k in v[i].keys():
                    v[i][k] = remove_space_around_semicolons(v[i][k])
            else:
                v[i] = {"name": remove_space_around_semicolons(v[i])}

        # If any of the spatial fields contain semi-colons, we need to create
        # multiple dictionaries.
        semicolons = None
        for d in v:
            for k in d.keys():
                if d[k] and ';' in d[k]:
                    semicolons = True
                    break

        setprop(data,prop,(create_dictionaries(v) if semicolons else v))

    return json.dumps(data)
Example #38
def bhlcontributortocollection(body,
                               ctype,
                               contributor_field="sourceResource/contributor"):
    """ Copies BHL contributor field value to collection field
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, contributor_field):
        contributor = getprop(data, contributor_field)
        acronym = "".join(c[0] for c in contributor.split())

        setprop(data, "sourceResource/collection/@id",
                "http://dp.la/api/collections/bhl--" + acronym)
        setprop(data, "sourceResource/collection/name", contributor)

    return json.dumps(data)
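The collection @id is just the contributor's initials appended to a fixed prefix; for example, with a sample contributor value:

contributor = "Missouri Botanical Garden"
acronym = "".join(c[0] for c in contributor.split())
print("http://dp.la/api/collections/bhl--" + acronym)
# -> http://dp.la/api/collections/bhl--MBG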
Example #39
def mdlstatelocatedin(body, ctype):
    """
    Service that accepts a JSON document and extracts the state from the
    address in the first dataProvider value
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    prop = "dataProvider"
    if exists(data, prop):
        address = iterify(getprop(data, prop))[0]
        for st, state in states.items():
            if (re.search("\s+%s\s+" % st, address)
                    or re.search("\s+%s\s+" % state, address)):
                setprop(data, "sourceResource/stateLocatedIn", state)
                break

    return json.dumps(data)
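The state is found by a whitespace-bounded search for either the abbreviation or the full name inside the first dataProvider address. A sketch with an assumed two-entry states table and a made-up address:

import re

states = {"MN": "Minnesota", "WI": "Wisconsin"}   # assumed subset

address = "Example Library, 123 Main St, Saint Paul, MN 55102"
for st, state in states.items():
    if re.search(r"\s+%s\s+" % st, address) or re.search(r"\s+%s\s+" % state, address):
        print(state)   # -> Minnesota
        break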
Example #40
def nara_enrich_location(body,
                         ctype,
                         action="nara_enrich_location",
                         prop="sourceResource/spatial"):
    """
    Service that massages a NARA JSON document.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if (exists(data, prop)):
        # Check spatial dictionaries to see if they are valid
        spatials = []
        for spatial in iterify(getprop(data, prop)):
            spatials.append(format_spatial(spatial))

        setprop(data, prop, spatials)

    return json.dumps(data)
Example #41
def remove_list_values(body, ctype, prop=None, values=None):
    """Given a comma-separated string of values, removes any instance of each
       value from the prop.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    v = getprop(data, prop, True)

    if isinstance(v, list) and values is not None:
        values = values.split(",")
        v = [s for s in v if s not in values]
        if v:
            setprop(data, prop, v)
        else:
            delprop(data, prop)

    return json.dumps(data)
def jsonfy_prop(body, ctype, prop=None):
    """ Some data is packed as strings that contain json. (UCSD)
    Take the data in the given property and turn any sub-values that can be
    read by json.loads into JSON objects.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if prop:
        obj = getprop(data, prop, True)
    else:
        obj = data

    obj_jsonfied = jsonfy_obj(obj)
    if prop:
        setprop(data, prop, obj_jsonfied)
    else:
        data = obj_jsonfied
    return json.dumps(data)
Example #43
def filter_path(_dict, path):
    """
    Repeatedly runs the cleaner function until all empty values are removed
    from the given path (i.e. until the structure stops changing).
    Arguments:
     _dict - dictionary to clean;
     path - an xpath-like path to the value that must be checked
    Returns:
     cleaned dictionary
    """
    d = copy.deepcopy(_dict)
    embracing_path, sep, value_key = path.rpartition(PATH_DELIM)
    try:
        dict_to_clean = getprop(d, embracing_path)
    except KeyError:
        logger.warning("Attempt to clean non existent path \"%s\"",
                       embracing_path)
        return _dict
    else:
        if value_key:
            cleaned_dict = filter_dict(dict_to_clean, filter_fields, value_key)
            setprop(d, embracing_path, cleaned_dict)
            return d
        else:
            return filter_dict(dict_to_clean, filter_fields, embracing_path)
Example #44
def shred(body, ctype, action="shred", prop=None, delim=';', keepdup=None):
    """
    Service that accepts a JSON document and "shreds" or "unshreds" the value
    of the field(s) named by the "prop" parameter

    "prop" can include multiple property names, delimited by a comma (the delim
    property is used only for the fields to be shredded/unshredded). This
    requires that the fields share a common delimiter however.
    """

    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON\n" + str(e)

    def mismatch_parens(s):
        return s.count("(") != s.count(")")

    for p in prop.split(','):
        if exists(data, p):
            v = getprop(data, p)
            if action == "shred":
                if isinstance(v, list):
                    try:
                        v = delim.join(v)
                    except Exception as e:
                        logger.error("Can't join on delim. ID: %s\n%s" %
                                     (data["_id"], str(e)))
                if delim in v:
                    setprop(data, p, v)
                else:
                    continue

                shredded = [""]
                for s in re.split(re.escape(delim), v):
                    if mismatch_parens(shredded[-1]):
                        shredded[-1] += "%s%s" % (delim, s)
                    else:
                        shredded.append(s)
                shredded = [i.strip() for i in shredded if i.strip()]
                if not keepdup:
                    result = []
                    for s in shredded:
                        if s not in result:
                            result.append(s)
                    shredded = result
                setprop(data, p, shredded)
            elif action == "unshred":
                if isinstance(v, list):
                    setprop(data, p, delim.join(v))

    return json.dumps(data)
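The shredding is delimiter splitting with one twist: a piece is glued back onto the previous piece whenever that previous piece's parentheses are unbalanced. A condensed standalone sketch of that loop:

import re

def paren_aware_split(value, delim=";"):
    parts = [""]
    for piece in re.split(re.escape(delim), value):
        # Re-attach the piece if the previous part has unbalanced parens.
        if parts[-1].count("(") != parts[-1].count(")"):
            parts[-1] += delim + piece
        else:
            parts.append(piece)
    return [p.strip() for p in parts if p.strip()]

print(paren_aware_split("Maps; Atlases (including road maps; charts); Globes"))
# -> ['Maps', 'Atlases (including road maps; charts)', 'Globes']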
def convert_dates(data, prop, earliest):
    """Converts dates.

    Arguments:
    data Dict - Data for conversion.
    prop Str - Properties divided with a comma.
    earliest Bool - if True, set only the earliest date; if False, set all
    dates.

    Returns:
    Nothing, the replacement is done in place.
    """
    for p in prop.split(','):
        dates = []
        if exists(data, p):
            v = getprop(data, p)
            if isinstance(v, list):  # fix for duplicate values in list
                v = list(OrderedDict.fromkeys(v))
            if not isinstance(v, dict) and len(v):
                if is_year_range_list(v):
                    dates.append({
                        "begin": v[0],
                        "end": v[-1],
                        "displayDate": "%s-%s" % (v[0], v[-1])
                    })
                else:
                    for s in (v if not isinstance(v, basestring) else [v]):
                        for part in s.split(";"):
                            display_date = remove_single_brackets_and_strip(
                                part)
                            stripped = clean_date(
                                remove_all_brackets_and_strip(part))
                            # Stripping bogus -00-00 data
                            if stripped[-6:] == "-00-00":
                                stripped = stripped[:-6]
                                display_date = stripped
                            if len(stripped) < 4:
                                continue
                            a, b = parse_date_or_range(stripped)
                            if b != DEFAULT_DATETIME_STR:
                                dates.append({
                                    "begin": a,
                                    "end": b,
                                    "displayDate": display_date
                                })
            else:
                # Already filled in, probably by mapper
                continue

            dates.sort(
                key=lambda d: d["begin"] if d["begin"] is not None
                else DEFAULT_DATETIME_STR
            )
            if dates:
                ###                if earliest:
                ###                    value_to_set = dates[0]
                ###                else:
                ###                    value_to_set = dates
                ###                setprop(data, p, value_to_set)
                setprop(data, p, dates)
            else:
                delprop(data, p)
Example #46
def shred(body, ctype, action="shred", prop=None, delim=';', keepdup=None):
    """
    Service that accepts a JSON document and "shreds" or "unshreds" the value
    of the field(s) named by the "prop" parameter

    "prop" can include multiple property names, delimited by a comma (the delim
    property is used only for the fields to be shredded/unshredded). This
    requires that the fields share a common delimiter however.

    The 'shred' action splits values by delimiter. It handles some complex edge
    cases beyond what split() expects. For example:
      ["a,b,c", "d,e,f"] -> ["a","b","c","d","e","f"]
      'a,b(,c)' -> ['a', 'b(,c)']
    Duplicate values are removed unless keepdup evaluates true.

    The 'unshred' action joins a list of values with delim.

    See: https://issues.dp.la/issues/2940
         https://issues.dp.la/issues/4251
         https://issues.dp.la/issues/4266
         https://issues.dp.la/issues/4578
         https://issues.dp.la/issues/4600
    """
    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON\n" + str(e)

    def index_for_first_open_paren(values):
        """
        Accepts a list of values. Returns the index of the first value
        containing an unmatched opening paren.
        """
        for v in values:
            if v.count("(") > v.count(")"):
                return values.index(v)
        return None

    def index_for_matching_close_paren(values):
        """
        Accepts a list of values. Returns the index of the last value
        containing an unmatched closing paren before the next unmatched
        opening paren.
        """
        index = None
        for v in values:
            if index is not None and v.count("(") > v.count(")"):
                return index
            elif v.count(")") > v.count("("):
                index = values.index(v)
        return index

    def rejoin_partials(values, delim):
        """
        Accepts a list of values which have been split by delim. Searches for
        values that were split apart because the delimiter fell inside
        parentheses.

        For example, this value:
          'my (somewhat contrived; value) with a delimiter enclosed in parens'
        would be split into:
          ['my (somewhat contrived', 'value) with a delimiter enclosed in parens']

        This method rejoins it.
        """
        index1 = index_for_first_open_paren(values)
        index2 = index_for_matching_close_paren(values)
        if index1 is not None and index2 is not None:
            if index1 == 0 and index2 == len(values) - 1:
                return [delim.join(values)]
            elif index1 == 0:
                values = [delim.join(values[:index2 + 1])
                          ] + values[index2 + 1:]
            elif index2 == len(values) - 1:
                values = values[:index1] + [delim.join(values[index1:])]
            else:
                values = values[:index1] + [
                    delim.join(values[index1:index2 + 1])
                ] + values[index2 + 1:]
            return rejoin_partials(values, delim)
        else:
            return values

    for p in prop.split(','):
        if exists(data, p):
            v = getprop(data, p)
            if action == "shred":
                if isinstance(v, list):
                    v = filter(None, v)
                    try:
                        v = delim.join(v)
                        v = v.replace("%s%s" % (delim, delim), delim)
                    except Exception as e:
                        logger.warn("Can't join list %s on delim for %s, %s" %
                                    (v, data["_id"], e))
                if delim in v:
                    setprop(data, p, v)
                else:
                    continue

                shredded = [""]
                for s in re.split(re.escape(delim), v):
                    shredded.append(s)
                shredded = rejoin_partials(shredded, delim)
                shredded = [i.strip() for i in shredded if i.strip()]

                if not keepdup:
                    result = []
                    for s in shredded:
                        if s not in result:
                            result.append(s)
                    shredded = result
                setprop(data, p, shredded)
            elif action == "unshred":
                if isinstance(v, list):
                    setprop(data, p, delim.join(v))

    return json.dumps(data)
Example #47
def copyprop(body,
             ctype,
             prop=None,
             to_prop=None,
             create=False,
             key=None,
             remove=None,
             no_replace=None,
             no_overwrite=None):
    """Copies value in one prop to another prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to copy from (default None)
    to_prop -- the prop to copy into (default None)
    create -- creates to_prop if True (default False)
    key -- the key to use if to_prop is a dict (default None)
    remove -- removes prop if True (default None)
    no_replace -- if True and to_prop is a string, converts it to a list and
                  appends the prop value
    no_overwrite -- if True, does not copy when to_prop already exists
    
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, to_prop) and no_overwrite:
        pass
    else:
        if exists(data, prop) and create and not exists(data, to_prop):
            val = {} if key else ""
            setprop(data, to_prop, val)

        if exists(data, prop) and exists(data, to_prop):
            val = getprop(data, prop)
            to_element = getprop(data, to_prop)

            if isinstance(to_element, basestring):
                if no_replace:
                    el = [to_element] if to_element else []
                    el.append(val)
                    # Flatten
                    val = [
                        e for s in el
                        for e in (s if not isinstance(s, basestring) else [s])
                    ]
                setprop(data, to_prop, val)
            else:
                # If key is set, assume to_element is dict or list of dicts
                if key:
                    if not isinstance(to_element, list):
                        to_element = [to_element]
                    for dict in to_element:
                        if exists(dict, key) or create:
                            setprop(dict, key, val)
                        else:
                            msg = "Key %s does not exist in %s" % (key,
                                                                   to_prop)
                            logger.debug(msg)
                else:
                    # Handle case where to_element is a list
                    if isinstance(to_element, list):
                        if isinstance(val, list):
                            to_element = to_element + val
                        else:
                            to_element.append(val)
                        setprop(data, to_prop, to_element)
                    else:
                        # to_prop is dictionary but no key was passed.
                        msg = "%s is a dictionary but no key was passed" % to_prop
                        logger.warn(msg)
                        setprop(data, to_prop, val)

            if remove:
                delprop(data, prop)

    return json.dumps(data)
def enrich_language(body,
                    ctype,
                    action="enrich_language",
                    prop="sourceResource/language"):
    """
    Service that accepts a JSON document and sets the language ISO 639-3
    code(s) and language name from the current language value(s) by:

    a) Checking if the value is a language code, else
    b) Attempting to convert the value from ISO 639-1 to ISO 639-3, else
    c) Attempting to find an exact language name match, else
    d) Attempting to find language name matches within the value
    """
    def iso1_to_iso3(s):
        s = re.sub("[-_/].*$", "", s).strip()
        return ISO639_1.get(s, s)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        language_strings = [v] if not isinstance(v, list) else v

        iso_codes = []
        for lang_string in language_strings:
            # Check if raw value is a code
            if lang_string not in iso_codes and lang_string in ISO639_3_SUBST:
                iso_codes.append(lang_string)
            else:
                # If lang_string is an ISO 639-1 code, convert to ISO 639-3
                iso3 = iso1_to_iso3(
                    re.sub("[\.\[\]\(\)]", "", lang_string).lower().strip())
                if iso3 not in iso_codes and iso3 in ISO639_3_SUBST:
                    iso_codes.append(iso3)
                else:
                    # First check for exact language name matches
                    for iso_code, regex in EXACT_LANGUAGE_NAME_REGEXES.items():
                        match = regex.match(lang_string.strip())
                        if match:
                            iso_codes.append(iso_code)
                            break

                    if match is None:
                        # Check for language names with word boundary regex
                        for iso_code, regex in WB_LANGUAGE_NAME_REGEXES.items(
                        ):
                            if regex.search(lang_string):
                                iso_codes.append(iso_code)

        if iso_codes:
            seen = set()
            language = [{
                "iso639_3": code,
                "name": ISO639_3_SUBST[code]
            } for code in iso_codes if not (code in seen or seen.add(code))]
            setprop(data, prop, language)
        else:
            logger.warning("Did not find language code in [%s] for record %s" %
                           (language_strings, data["_id"]))
            delprop(data, prop)

    return json.dumps(data)
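The lookup chain (use the value as a code if possible, otherwise normalize it and map ISO 639-1 to ISO 639-3) can be sketched with tiny assumed tables; the module's real ISO639_1 and ISO639_3_SUBST tables are much larger and not shown here.

import re

ISO639_1 = {"en": "eng", "fr": "fra"}                 # assumed miniature table
ISO639_3_SUBST = {"eng": "English", "fra": "French"}  # assumed miniature table

def to_iso3(lang_string):
    if lang_string in ISO639_3_SUBST:   # already an ISO 639-3 code
        return lang_string
    cleaned = re.sub(r"[\.\[\]\(\)]", "", lang_string).lower().strip()
    cleaned = re.sub(r"[-_/].*$", "", cleaned).strip()   # "en-US" -> "en"
    return ISO639_1.get(cleaned, cleaned)

print(to_iso3("eng"))     # -> eng
print(to_iso3("en-US"))   # -> eng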
Example #49
def enrichformat(body,
                 ctype,
                 action="enrich-format",
                 prop="sourceResource/format",
                 type_field="sourceResource/type"):
    """
    Service that accepts a JSON document and enriches the "format" field of
    that document by: 

    a) Setting the format to be all lowercase
    b) Running through a set of cleanup regex's (e.g. image/jpg -> image/jpeg)
    c) Checking to see if the field is a valid IMT
       See http://www.iana.org/assignments/media-types for list of valid
       media-types. We require that a subtype is defined.
    d) Removing any extra text after the IMT
    e) Moving valid IMT values to hasView/format if hasView exists and
       its format is not set
    f) Setting type field from format field, if it is not set. The format field
       is taken if it is a string, or the first element if it is a list. It is
       then split and the first part of IMT is taken.

    By default works on the 'sourceResource/format' field but can be overridden
    by passing the name of the field to use as the 'prop' parameter.
    """

    FORMAT_2_TYPE_MAPPINGS = {
        "audio": "sound",
        "image": "image",
        "video": "moving image",
        "text": "text"
    }

    REGEXPS = ('audio/mp3', 'audio/mpeg'), ('images/jpeg', 'image/jpeg'), \
              ('image/jpg', 'image/jpeg'), ('image/jp$', 'image/jpeg'), \
              ('img/jpg', 'image/jpeg'), ('^jpeg$', 'image/jpeg'), \
              ('^jpg$', 'image/jpeg'), ('\W$', '')
    IMT_TYPES = [
        'application', 'audio', 'image', 'message', 'model', 'multipart',
        'text', 'video'
    ]

    def get_ext(s):
        ext = os.path.splitext(s)[1].split('.')

        return ext[1] if len(ext) == 2 else ""

    def cleanup(s):
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
            s = re.sub(r"^([a-z0-9/]+)\s.*", r"\1", s)
        return s

    def is_imt(s):
        logger.debug("Checking: " + s)
        imt_regexes = [re.compile('^' + x + '(/)') for x in IMT_TYPES]
        return any(regex.match(s) for regex in imt_regexes)

    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON\n" + str(e)

    imt_values = []
    if exists(data, prop):
        v = getprop(data, prop)
        format = []
        hasview_format = []

        for s in (v if not isinstance(v, basestring) else [v]):
            if s.startswith("http") and is_absolute(s):
                s = get_ext(s)
            cleaned = cleanup(s)
            if is_imt(cleaned):
                # Append to imt_values for use in type
                imt_values.append(cleaned)
                # Move IMT values to hasView/format else discard
                if exists(data, "hasView") and not \
                    exists(data, "hasView/format") and \
                                cleaned not in hasview_format:
                    hasview_format.append(cleaned)
            else:
                # Retain non-IMT values in sourceResource/format, non-cleaned
                if s not in format:
                    format.append(s)

        if format:
            if len(format) == 1:
                format = format[0]
            setprop(data, prop, format)
        else:
            delprop(data, prop)

        if hasview_format:
            if len(hasview_format) == 1:
                hasview_format = hasview_format[0]
            setprop(data, "hasView/format", hasview_format)

    # Setting the type if it is empty.
    if not exists(data, type_field) and imt_values:
        type = []
        for imt in imt_values:
            t = getprop(FORMAT_2_TYPE_MAPPINGS, imt.split("/")[0], True)
            if t and t not in type:
                type.append(t)

        if type:
            if len(type) == 1:
                type = type[0]
            setprop(data, type_field, type)

    return json.dumps(data)
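# A minimal standalone sketch (not part of the service above) showing what
# the REGEXPS cleanup inside enrichformat does to a few raw format strings.
# The pattern list mirrors the one defined in enrichformat; the _SKETCH_*
# names and the sample inputs are illustrative only.
import re

_SKETCH_REGEXPS = [
    ('audio/mp3', 'audio/mpeg'), ('images/jpeg', 'image/jpeg'),
    ('image/jpg', 'image/jpeg'), ('image/jp$', 'image/jpeg'),
    ('img/jpg', 'image/jpeg'), ('^jpeg$', 'image/jpeg'),
    ('^jpg$', 'image/jpeg'), ('\W$', '')
]

def _sketch_cleanup(s):
    s = s.lower().strip()
    for pattern, replace in _SKETCH_REGEXPS:
        s = re.sub(pattern, replace, s)
        # Keep only the leading IMT-like token, dropping trailing text
        s = re.sub(r"^([a-z0-9/]+)\s.*", r"\1", s)
    return s

for raw in ["Image/JPG", "jpeg", "image/tiff of map"]:
    print raw, "->", _sketch_cleanup(raw)
# Image/JPG -> image/jpeg
# jpeg -> image/jpeg
# image/tiff of map -> image/tiff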
Example #50
def all_transform(d, p):
    global PROVIDER
    logger.debug("TRANSFORMING %s" % d["_id"])

    # For spec_type use
    control_008_28 = None
    datafield_086_or_087 = None

    data = {
        "sourceResource": {
            "identifier": [],
            "contributor": [],
            "creator": [],
            "date": [],
            "description": [],
            "extent": [],
            "language": [],
            "spatial": [],
            "publisher": [],
            "isPartOf": [],
            "rights": [],
            "stateLocatedIn": [],
            "subject": [],
            "temporal": [],
            "title": [None, None, None],
            "format": [],
            "type": [],
            "specType": []
        }
    }

    # Mapping dictionaries for use with datafield:
    # Keys are used to check if there is a tag match. If so, the value provides
    # a list of (property, code) tuples. In the case where certain tags have
    # prominence over others, the tuples will be of the form
    # (property, index, code). To exclude a code, prefix it with a "!":
    # [("format", "!cd")] will exclude the "c" and "d" codes (see def
    # _get_values). 
    data_map = {
        lambda t: t == "856":           [("isShownAt", "u")],
        lambda t: t == "973":           [("provider", "ab")],
        lambda t: t == "974":           [("dataProvider", "u")],
        lambda t: t == "852":           [("dataProvider", "a")]
    }
    source_resource_map = {
        lambda t: t in ("020", "022",
                        "035"):         [("identifier", "a")],
        lambda t: t == "050":           [("identifier", "ab")],
        lambda t: t in ("100", "110",
                        "111"):         [("creator", None)],
        lambda t: t == "041":           [("language", "a")],
        lambda t: t == "260":           [("date", "c"), ("publisher", "ab")],
        lambda t: t == "270":           [("stateLocatedIn", "c")],
        lambda t: t == "300":           [("extent", "ac")],
        lambda t: t in ("337", "338"):  [("format", "a")],
        lambda t: t == "340":           [("format", "a"), ("extent", "b")],
        lambda t: t.startswith("5"):    [("description", "a")],
        lambda t: t in ("506", "540"):  [("rights", None)],
        lambda t: t == "648":           [("temporal", None)],
        lambda t: t in ("700", "710",
                        "711", "720"):  [("contributor", None)],
        #lambda t: t == "662":          [("sourceResource/spatial", None)],
        lambda t: t == "240":           [("title", 2, None)],
        lambda t: t == "242":           [("title", 1, None)],
        lambda t: t == "245":           [("title", 0, "!c")],
        lambda t: t == "970":           [("type", "a")],
        lambda t: t == "651":           [("spatial", "a")],
        lambda t: int(t) in set([600, 650, 651] +
                            range(610, 620) +
                            range(653, 659) +
                            range(690, 700)):   [("subject", None),
                                                 ("format", "v"),
                                                 ("temporal", "y"),
                                                 ("spatial", "z")],
        lambda t: (760 <= int(t) <= 787):       [("isPartOf", None)],

    }
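    # Illustration of how these mapping dicts are consumed below: for a
    # datafield with tag "260", the source_resource_map key
    # lambda t: t == "260" matches, so ("date", "c") and ("publisher", "ab")
    # are both applied -- subfield "c" feeds sourceResource/date and
    # subfields "a" and "b" feed sourceResource/publisher. For tag "245",
    # the three-element tuple ("title", 0, "!c") writes every subfield
    # except "c" into title slot 0.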

    # Handle datafield
    for item in _as_list(getprop(d, p)):
        for _dict in _as_list(item):
            tag = _dict.get("tag", None)
            # Skip cases where there is no tag or where tag == "ERR"
            try:
                int(tag)
            except:
                continue
            # Handle data_map matches
            for match, tuples in data_map.iteritems():
                if match(tag):
                    for tup in tuples:
                        prop, codes = tup
                        values = _get_values(_dict, codes)
                        if prop == "provider":
                            data.update(provider_transform(values))
                        elif prop == "dataProvider":
                            if tag == "974" and PROVIDER == "hathitrust":
                                dp = dataprovider_transform_hathi(values)
                                data.update(dp)
                            elif tag == "852" and PROVIDER == "uiuc":
                                if values:
                                    data["dataProvider"] = values[0]
                        else:
                            if values:
                                data[prop] = values[0]
            # Handle source_resource_map matches
            for match, tuples in source_resource_map.iteritems():
                if match(tag):
                    for tup in tuples:
                        if len(tup) == 2:
                            prop, codes = tup
                            if prop == "contributor":
                                # Handle values for contributor
                                values = _get_contributor_values(_dict, codes)
                            elif prop == "subject":
                                # Handle values for subject
                                values = _get_subject_values(_dict, tag)
                            elif prop == "spatial":
                                # Handle values for spatial
                                values = _get_spatial_values(_dict, tag, codes)
                            else:
                                # Handle values for all other sourceResource
                                # fields
                                values = _get_values(_dict, codes)
                            if prop == "identifier":
                                # Handle identifier labeling
                                label = None
                                if tag == "020":
                                    label = "ISBN:"
                                elif tag == "022":
                                    label = "ISSN:"
                                elif tag == "050":
                                    label = "LC call number:"
                                if label:
                                    # Insert label as first value item as
                                    # values will be joined
                                    values.insert(0, label)
                            values = _join_sourceresource_values(prop, values)
                            if prop == "type":
                                data["sourceResource"].update(
                                    datafield_type_transform(values)
                                )
                            else:
                                data["sourceResource"][prop].extend(values)
                        elif len(tup) == 3:
                            prop, index, codes = tup
                            values = _get_values(_dict, codes)
                            data["sourceResource"][prop][index] = values 
            if tag == "662":
                # Test: Log document with 662 (spatial)
                logger.debug("Document has 662: %s" % d["_id"])
            elif tag == "086" or tag == "087":
                datafield_086_or_087 = True

    # Handle sourceResource/title
    title = filter(None, data["sourceResource"]["title"])
    if title:
        for i in range(len(title)):
            title[i] = " ".join(title[i])
        data["sourceResource"]["title"] = title
    else:
        del data["sourceResource"]["title"]

    # Handle controlfield: values from here are needed to update
    # sourceResource/identifier, sourceResource/language, and
    # sourceResource/format
    format_char_control = None
    format_char_leader = None
    for item in _as_list(getprop(d, "controlfield")):
        if "#text" in item and "tag" in item:
            # Map tag 001 only for Hathi
            if item["tag"] == "001" and PROVIDER == "hathitrust":
                value = "Hathi: " + item["#text"]
                data["sourceResource"]["identifier"].append(value)
            if item["tag"] == "007":
                # For format use
                format_char_control = item["#text"][0]
            if item["tag"] == "008":
                if len(item["#text"]) > 28:
                    # For spec_type use
                    control_008_28 = item["#text"][28]
                if len(item["#text"]) > 37:
                    data["sourceResource"]["language"].append(
                        item["#text"][35:38]
                    )
    leader = getprop(d, "leader")
    if len(leader) > 6:
        format_char_leader = leader[6]

    format_values = format_transform(format_char_control, format_char_leader)
    data["sourceResource"]["format"].extend(format_values)
        
    # Split language
    language = []
    for lang_str in data["sourceResource"]["language"]:
        language.extend([lang_str[i:i+3] for i in range(0, len(lang_str), 3)])
    data["sourceResource"]["language"] = language


    # Add "Government Document" to spec_type if applicable
    gov_spec_type = get_gov_spec_type(control_008_28, datafield_086_or_087)
    if gov_spec_type:
        data["sourceResource"]["specType"].append(gov_spec_type)

    # Remove empty sourceResource values
    del_keys = [key for key in data["sourceResource"] if not
                data["sourceResource"][key]]
    for key in del_keys:
        del data["sourceResource"][key]

    # Handle Hathi isShownAt
    is_shown_at = None
    for id in _as_list(getprop(data, "sourceResource/identifier")):
        if id.startswith("Hathi: "):
            id = id.split("Hathi: ")[-1]
            is_shown_at = "http://catalog.hathitrust.org/Record/%s" % id
            break
    if is_shown_at:
        setprop(data, "isShownAt", is_shown_at)

    return data
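# A hedged usage sketch for all_transform: the record shape below is inferred
# from the controlfield/leader handling visible above (the datafield subfield
# layout depends on _get_values and related helpers, so only "tag" is filled
# in), and the call itself is left commented because it relies on module
# helpers such as _get_values, format_transform and the global PROVIDER.
_example_record = {
    "_id": "sample-id",
    "datafield": [{"tag": "245"}],  # plus provider-specific subfield entries
    "controlfield": [{"tag": "008", "#text": " " * 35 + "eng"}],
    "leader": "00000cam a2200000 a 4500",
}
# mapped = all_transform(_example_record, "datafield")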
Example #51
def enrichlocation(body,
                   ctype,
                   action="enrich_location",
                   prop="sourceResource/spatial"):
    """
    Service that accepts a JSON document and enriches the "spatial" field of that document by
    iterating through the spatial fields and mapping to the state and iso3166-2, if not already
    mapped, through teh get_isostate function. This function takes the optional parameter abbrev,
    and if it is set it will search the fields for State name abbreviations. If a previous provider-
    specific location enrichment module ran, the default is to not search those fields for State name
    abbreviations, but only for full State names.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)

        # If prior provider-specific location enrichment occurred,
        # v[0] will be a dictionary
        if isinstance(v[0], dict):
            for k in v[0].keys():
                v[0][k] = remove_space_around_semicolons(v[0][k])
            """
            if 'state' in v[0]:
                # Handle case where a previous provider-specific location
                # enrichment set the state field
                isostate = get_isostate(v[0]['state'])
                # It may be the case that the 'state' field does not contain a
                # State name
                if isostate[0]:
                    v[0]['iso3166-2'] = isostate[0]
                    v[0]['state'] = isostate[1]
                else:
                    # We may want to keep whatever non-State value was placed in
                    # state
                    v[0]['name'] = v[0]['state']
                    # Remove bogus state
                    del v[0]['state']
            else:
                # Handle case where a previous provider-specific location
                # enrichment did not set the state field
                for val in v[0].values():
                    isostate = get_isostate(val)
                    if isostate[0]:
                        v[0]['iso3166-2'] = isostate[0]
                        v[0]['state'] = isostate[1]
                        break
            """
        else:
            # Handle the case where no previous provider-specific location
            # enrichment occurred. Convert spatial from a list of strings to
            # a list of dictionaries.
            sp = []
            for s in (v if not isinstance(v, basestring) else [v]):
                d = {}
                d['name'] = remove_space_around_semicolons(s)
                """
                isostate = get_isostate(d['name'], abbrev="Yes")
                if isostate[0]:
                    d['iso3166-2'] = isostate[0]
                    d['state'] = isostate[1]
                """
                sp.append(d)
            v = sp

        # If any of the spatial fields contain semi-colons, we need to create
        # multiple dictionaries.
        semicolons = None
        for d in v:
            for k in d.keys():
                if d[k] and ';' in d[k]:
                    semicolons = True
                    break

        setprop(data, prop, (create_dictionaries(v) if semicolons else v))

    return json.dumps(data)
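# A standalone sketch (with a stand-in for remove_space_around_semicolons,
# which is defined elsewhere in this pipeline) of the string-to-dictionary
# conversion enrichlocation performs when no provider-specific enrichment has
# run: plain spatial strings become {"name": ...} dictionaries with the
# spacing around semicolons normalized.
import re

def _sketch_normalize_semicolons(s):
    # Approximation of remove_space_around_semicolons, illustration only
    return re.sub(r"\s*;\s*", "; ", s)

spatial_strings = ["Boston ; Massachusetts", "Salt Lake City"]
spatial_dicts = [{"name": _sketch_normalize_semicolons(s)}
                 for s in spatial_strings]
# spatial_dicts == [{"name": "Boston; Massachusetts"},
#                   {"name": "Salt Lake City"}]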
Example #52
def map_intermediate_provider(self):
    prop = "source"
    if exists(self.provider_data, prop):
        im_prov = getprop(self.provider_data, prop)
        if im_prov:
            setprop(self.mapped_data, "intermediateProvider", im_prov)
Example #53
def enrichtype(body,
               ctype,
               action="enrich-type",
               prop="sourceResource/type",
               format_field="sourceResource/format"):
    """   
    Service that accepts a JSON document and enriches the "type" field of that
    document by: 

    a) making the type lowercase
    b) converting "image" to "still image"
      (TODO: Amy to confirm that this is ok)
    c) applying a set of regexps to do data cleanup (remove plural forms)
    d) moving all items that are not standard DC types to the
       sourceResource/format
       (http://dublincore.org/documents/resource-typelist/)
    
    By default works on the 'type' field, but can be overridden by passing the
    name of the field to use as a parameter
    """

    REGEXPS = ('images','image'), ('still image','image'),\
              ('textual records', 'text'),\
              ('photographs and other graphic materials', 'image'),\
              ('texts', 'text')
    DC_TYPES = [
        'collection', 'dataset', 'event', 'image', 'still image',
        'interactive resource', 'moving image', 'physical object', 'service',
        'software', 'sound', 'text'
    ]

    def cleanup(s):
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
        return s

    def is_dc_type(s):
        return s in DC_TYPES

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        dctype = []
        f = getprop(data, format_field) if exists(data, format_field) else []
        if not isinstance(f, list):
            f = [f]

        for s in (v if not isinstance(v, basestring) else [v]):
            if is_dc_type(cleanup(s)):
                dctype.append(cleanup(s))
            else:
                f.append(s)

        if dctype:
            if len(dctype) == 1:
                dctype = dctype[0]
            setprop(data, prop, dctype)
        else:
            delprop(data, prop)

        if len(f) > 1:
            setprop(data, format_field, f)
        elif len(f) == 1:
            setprop(data, format_field, f[0])

    return json.dumps(data)
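# A minimal standalone sketch (mirroring the REGEXPS and DC_TYPES defined
# inside enrichtype, illustrative only; the _SKETCH_* names are not part of
# the service) of how raw type strings are split between sourceResource/type
# and sourceResource/format.
import re

_SKETCH_TYPE_REGEXPS = [('images', 'image'), ('still image', 'image'),
                        ('textual records', 'text'),
                        ('photographs and other graphic materials', 'image'),
                        ('texts', 'text')]
_SKETCH_DC_TYPES = ['collection', 'dataset', 'event', 'image', 'still image',
                    'interactive resource', 'moving image', 'physical object',
                    'service', 'software', 'sound', 'text']

def _sketch_cleanup_type(s):
    s = s.lower().strip()
    for pattern, replace in _SKETCH_TYPE_REGEXPS:
        s = re.sub(pattern, replace, s)
    return s

dctype, fmt = [], []
for s in ["Photographs and other graphic materials", "Postcards"]:
    cleaned = _sketch_cleanup_type(s)
    if cleaned in _SKETCH_DC_TYPES:
        dctype.append(cleaned)
    else:
        fmt.append(s)  # non-DC values are retained for the format field
print dctype, fmt
# ['image'] ['Postcards']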
Example #54
def update_ingestion_doc(self, ingestion_doc, **kwargs):
    for prop, value in kwargs.items():
        setprop(ingestion_doc, prop, value)
    self.dashboard_db.save(ingestion_doc)
Example #55
def cleanup_language(body,
                     ctype,
                     action="cleanup_language",
                     prop="sourceResource/language"):
    """
    Service that accepts a JSON document and cleans each value of the language
    field of that document by:

    a) stripping periods, brackets and parentheses
    b) convert from ISO 639-1 to ISO 639-3
    c) looking for matches in the value using LANGUAGE_NAME_REGEXES
    """
    def iso1_to_iso3(s):
        s = re.sub("[-_/].*$", "", s).strip()
        return ISO639_1.get(s, s)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    if exists(data, prop):
        v = getprop(data, prop)
        v = [v] if not isinstance(v, list) else v

        languages = []
        for s in v:
            if s not in languages and s in ISO639_3_SUBST:
                languages.append(s)
            else:
                s = re.sub("[\.\[\]]", "", s).lower().strip()
                iso = re.sub("[\(\)]", "", s)
                # First convert iso1 to iso3
                iso = iso1_to_iso3(iso)
                if iso in ISO639_3_SUBST and iso not in languages:
                    languages.append(iso)
                else:
                    for n in iso.split(" "):
                        # Since we split on whitespace, we only want to check
                        # against single word reference names so we use
                        # ISO639_3_1
                        n = n.title()
                        if n in ISO639_3_1.values() and n not in languages:
                            languages.append(n)

                    # Use s (with parentheses intact)
                    match = [
                        r.search(s).group() for r in LANGUAGE_NAME_REGEXES
                        if r.search(s)
                    ]
                    if match:
                        languages += list(
                            set([m.strip().title()
                                 for m in match]) - set(languages))

        if languages:
            # Remove duplicates
            lang = []
            for l in languages:
                if ISO639_3_SUBST.get(l, None) not in languages:
                    lang.append(l)
            setprop(data, prop, filter(None, lang))
        else:
            delprop(data, prop)

    return json.dumps(data)
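# A standalone sketch of the iso1_to_iso3 step above. The two-letter lookup
# table here is a tiny stand-in for the ISO639_1 mapping used by the real
# service, which is defined elsewhere in this pipeline; the _sketch_* names
# are illustrative only.
import re

_SKETCH_ISO639_1 = {"en": "eng"}

def _sketch_iso1_to_iso3(s):
    # Strip any region/script suffix such as "en-US" or "en_GB" first
    s = re.sub("[-_/].*$", "", s).strip()
    return _SKETCH_ISO639_1.get(s, s)

print _sketch_iso1_to_iso3("en-US")   # eng
print _sketch_iso1_to_iso3("eng")     # eng (already a three-letter code)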
Example #56
def geocode(body, ctype, prop="sourceResource/spatial", newprop='coordinates'):
    '''
    Adds geocode data to the record as follows:

    1. If the coordinates property does not exist, attempt to extract it from
       the name property.
    2. Run GeoNames enrichment, reverse-geocoding coordinate values to
       identify parent features, or (if none exist) searching for name
       values. Put parent features in the appropriate state/country values.
    3. If we still have not identified the place, use Bing to get lat/long
       values. If coordinates are found, pass them through GeoNames again to
       identify parent features.
    4. Add any features not already present to the spatial dictionary.

    An illustration of the expected spatial shape follows this function.
    '''
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if (not exists(data, prop)):
        pass
    else:
        logger.debug("Geocoding %s" % data["_id"])
        value = getprop(data, prop)
        places = []
        for v in iterify(value):
            bing_geocode = True
            if not isinstance(v, dict):
                logger.error("Spatial value must be a dictionary; record %s" %
                             data["_id"])
                continue

            place = Place(v)

            if place.name:
                coords = get_coordinates(place.name)
                if coords:
                    place.coordinates = coords
                    place.name = None
                    place.set_name()

            # Run Geonames enrichment to do initial search
            place.enrich_geodata(DplaGeonamesGeocoder())

            # Don't enrich with geodata if place is 'United States'
            pattern = ur" *(United States(?!-)|États-Unis|USA)"
            if (place.name and re.search(pattern, place.name)):
                bing_geocode = False

            if bing_geocode:
                # Attempt to find this item's lat/lng coordinates
                if not place.coordinates:
                    api_key = module_config().get("bing_api_key")
                    place.enrich_geodata(DplaBingGeocoder(api_key=api_key))
                    # rerun geonames enrichment with new coordinates
                    place.enrich_geodata(DplaGeonamesGeocoder())

            if not place.validate():
                if not place.set_name():
                    logger.error("Spatial dictionary must have a " +
                                 "'name' property. Could not enhance input " +
                                 "data to include a name property; " +
                                 "record %s" % data["_id"])

            places.append(place)

        values = map(lambda x: x.to_map_json(), Place.merge_related(places))
        setprop(data, prop, values)

    return json.dumps(data)
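# A hedged illustration of the shape geocode works with: each entry in
# sourceResource/spatial must already be a dictionary (strings are skipped
# with an error above), e.g. {"name": "Boston, MA"}. After enrichment a Place
# may contribute keys such as coordinates, state and country via
# Place.to_map_json(); the exact output keys depend on the Place class and
# the GeoNames/Bing geocoders defined elsewhere in this pipeline.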
Example #57
def movedatevalues(body, ctype, action="move_date_values", prop=None,
                   to_prop="sourceResource/temporal"):
    """
    Service that accepts a JSON document and moves any dates found in the prop
    field to the temporal field.
    """

    if not prop:
        logger.error("Prop param is None in %s" % __name__)
        return body

    REGSEARCH = [
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}\s*[-/]\s*\d{4}\s*[-/]\s*\d{1,2}",
        "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
        "\d{4}\s*[-/]\s*\d{4}",
        "\d{1,2}\s*[-/]\s*\d{4}",
        "\d{4}\s*[-/]\s*\d{1,2}",
        "\d{4}s?",
        "\d{1,2}\s*(?:st|nd|rd|th)\s*century",
        ".*circa.*"
        ]

    def cleanup(s):
        s = re.sub("[\(\)\.\?]", "",s)
        return s.strip()

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if exists(data, prop):
        values = iterify(getprop(data, prop))
        remove = []
        toprop = iterify(getprop(data, to_prop)) if exists(data, to_prop) \
                 else []
        
        for v in iterify(values):
            if isinstance(v, basestring):
                c = cleanup(v)
                for pattern in REGSEARCH:
                    m = re.compile(pattern, re.I).findall(c)
                    if len(m) == 1 and not re.sub(m[0], "", c).strip():
                        if m[0] not in toprop:
                            toprop.append(m[0])
                        # Append the non-cleaned value to remove
                        remove.append(v)
                        break

        if toprop:
            setprop(data, to_prop, toprop)
            if len(values) == len(remove):
                delprop(data, prop)
            else:
                setprop(data, prop, [v for v in values if v not in remove])
            

    return json.dumps(data)
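# A standalone sketch (patterns copied from REGSEARCH above, illustrative
# only; the _sketch_* names are not part of the service) of how
# movedatevalues decides that a value is purely a date: a value moves to the
# temporal field only when a single pattern match consumes the whole cleaned
# string.
import re

_SKETCH_PATTERNS = ["\d{4}\s*[-/]\s*\d{4}", "\d{4}s?", ".*circa.*"]

def _sketch_is_pure_date(value):
    cleaned = re.sub("[\(\)\.\?]", "", value).strip()
    for pattern in _SKETCH_PATTERNS:
        m = re.compile(pattern, re.I).findall(cleaned)
        if len(m) == 1 and not re.sub(m[0], "", cleaned).strip():
            return True
    return False

print _sketch_is_pure_date("(1900-1910)")         # True  -> moved to temporal
print _sketch_is_pure_date("Autumn, New England") # False -> left in place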
def add_identifier(self, value):
    prop = "sourceResource/identifier"
    identifier = self._get_mapped_value(prop)
    identifier.append("Hathi: " + value)
    setprop(self.mapped_data, prop, identifier)