def map_type(self):
    """Set sourceResource "type" from object-type and description elements.

    Candidate strings are gathered from three places in the record:

    - freetext/physicalDescription: the "#text" of every element
    - freetext/objectType: the "#text" of elements whose "@label" is "Type"
    - indexedStructured/object_type: the items themselves

    Each candidate is stripped, dropped if empty, and lowercased.  The
    physical-description strings are paired with self.type_for_phys_keyword
    and the object-type strings with self.type_for_ot_keyword, and both
    pairs are handed to itemtype.type_for_strings_and_mappings (physical
    description first).  If that raises itemtype.NoTypeError, a warning is
    logged and the type falls back to "image".
    """
    labelled_types = self.extract_xml_items("freetext", "objectType")
    descriptions = self.extract_xml_items("freetext", "physicalDescription")
    indexed_types = self.extract_xml_items("indexedStructured", "object_type")

    phys_type_strings = [
        text.lower()
        for text in (desc.get("#text", "").strip() for desc in descriptions)
        if text
    ]

    # Only objectType elements explicitly labelled "Type" contribute.
    object_type_strings = [
        text.lower()
        for text in (
            entry.get("#text", "").strip()
            for entry in labelled_types
            if entry.get("@label", "") == "Type"
        )
        if text
    ]
    object_type_strings.extend(
        text.lower()
        for text in (raw.strip() for raw in indexed_types)
        if text
    )

    try:
        new_type = itemtype.type_for_strings_and_mappings([
            (phys_type_strings, self.type_for_phys_keyword),
            (object_type_strings, self.type_for_ot_keyword),
        ])
    except itemtype.NoTypeError:
        record_id = self.provider_data.get("_id", "[no _id]")
        logger.warning("Can not deduce type for item with _id: %s" % record_id)
        new_type = "image"

    self.update_source_resource({"type": new_type})
def map_type(self):
    """Work out the item's type and store it in the source resource.

    Strings are taken from the "#text" of freetext/physicalDescription
    elements, from the "#text" of freetext/objectType elements whose
    "@label" is "Type", and from the indexedStructured/object_type items.
    Every string is stripped, discarded when empty, and lowercased before
    being passed, together with the matching keyword mapping, to
    itemtype.type_for_strings_and_mappings.

    When itemtype raises NoTypeError the problem is logged as a warning and
    'image' is used as the type.
    """
    def cleaned(values):
        # Strip each value, drop the ones left empty, lowercase the rest.
        return [v.lower() for v in (raw.strip() for raw in values) if v]

    type_elements = self.extract_xml_items("freetext", "objectType")
    phys_elements = self.extract_xml_items("freetext", "physicalDescription")
    indexed_values = self.extract_xml_items("indexedStructured", "object_type")

    phys_type_strings = cleaned(
        element.get("#text", "") for element in phys_elements
    )
    object_type_strings = cleaned(
        element.get("#text", "")
        for element in type_elements
        if element.get("@label", "") == "Type"
    )
    object_type_strings += cleaned(indexed_values)

    # Physical-description strings are listed ahead of object-type strings.
    mappings = [
        (phys_type_strings, self.type_for_phys_keyword),
        (object_type_strings, self.type_for_ot_keyword),
    ]
    try:
        resolved = itemtype.type_for_strings_and_mappings(mappings)
    except itemtype.NoTypeError:
        item_id = self.provider_data.get("_id", "[no _id]")
        logger.warning("Can not deduce type for item with _id: %s" % item_id)
        resolved = 'image'

    self.update_source_resource({"type": resolved})
def enrichtype(body, ctype, action="enrich-type",
               prop="sourceResource/type",
               format_field="sourceResource/format",
               default=None, send_rejects_to_format=False):
    """Service that accepts a JSON document and enriches sourceResource/type.

    The existing sourceResource "type" and "format" values are lowercased
    and handed to itemtype.type_for_strings_and_mappings (type strings
    first, then format strings); the result replaces sourceResource/type.

    Arguments:
    body -- the JSON document, as a string
    ctype, action -- not read by this function
    prop, format_field -- not read by this function; the "type" and
        "format" keys of sourceResource are always the ones used
    default -- type to assign when itemtype raises NoTypeError.  If it is
        falsy, the "type" key is removed from sourceResource instead.
    send_rejects_to_format -- when truthy, whatever itemtype.rejects
        returns for the type strings is appended to sourceResource/format

    Returns the enriched document serialized as JSON.  If the body cannot
    be parsed, the response code is set to 500 and a plain-text error
    message is returned.  If the document has no sourceResource, a warning
    is logged and the body is returned unmodified.
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    try:
        source_resource = data['sourceResource']
        sr_type = source_resource.get('type', [])
        sr_format = source_resource.get('format', [])
    except KeyError:
        # In this case, sourceResource is not present, so give up and return
        # the original data unmodified.
        id_for_msg = data.get('_id', '[no id]')
        logger.warning('enrich-type lacks sourceResource for _id %s' %
                       id_for_msg)
        return body

    type_strings = []
    format_strings = []

    if sr_type:
        for t in sr_type if isinstance(sr_type, list) else [sr_type]:
            if isinstance(t, dict):
                # Prefer '#text'; fall back to 'text' when it is missing
                # or empty.
                t = t.get('#text') or t.get('text', '')
            # JSON null, numbers and nested containers have no lower();
            # skip them rather than fail the whole document.  The check is
            # duck-typed so any string type is accepted.
            if hasattr(t, 'lower'):
                type_strings.append(t.lower())

    if sr_format:
        for f in sr_format if isinstance(sr_format, list) else [sr_format]:
            if hasattr(f, 'lower'):
                format_strings.append(f.lower())

    try:
        source_resource['type'] = itemtype.type_for_strings_and_mappings([
            (type_strings, type_for_type_keyword),
            (format_strings, type_for_format_keyword),
        ])
    except itemtype.NoTypeError:
        id_for_msg = data.get('_id', '[no id]')
        logger.warning('Can not deduce type for item with _id: %s' %
                       id_for_msg)
        if default:
            source_resource['type'] = default
        else:
            # Drop any existing type; a missing key is not an error.
            source_resource.pop('type', None)
    finally:
        if send_rejects_to_format and type_strings:
            rej = itemtype.rejects([(type_strings, type_for_type_keyword)])
            if rej:
                if not isinstance(sr_format, list):
                    sr_format = [sr_format]
                sr_format.extend(rej)
                source_resource['format'] = sr_format

    return json.dumps(data)
def enrichtype(body, ctype, action="enrich-type",
               prop="sourceResource/type",
               format_field="sourceResource/format",
               default=None, send_rejects_to_format=False):
    """Service that accepts a JSON document and enriches sourceResource/type.

    The existing sourceResource "format" and "type" values are lowercased
    and handed to itemtype.type_for_strings_and_mappings (format strings
    first, then type strings); the result replaces sourceResource/type.

    Arguments:
    body -- the JSON document, as a string
    ctype, action -- not read by this function
    prop, format_field -- not read by this function; the "type" and
        "format" keys of sourceResource are always the ones used
    default -- type to assign when itemtype raises NoTypeError.  If it is
        falsy, the "type" key is removed from sourceResource instead.
    send_rejects_to_format -- when truthy, whatever itemtype.rejects
        returns for the type strings is appended to sourceResource/format

    Returns the enriched document serialized as JSON.  If the body cannot
    be parsed, the response code is set to 500 and a plain-text error
    message is returned.  If the document has no sourceResource, a warning
    is logged and the body is returned unmodified.
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    try:
        source_resource = data['sourceResource']
        sr_type = source_resource.get('type', [])
        sr_format = source_resource.get('format', [])
    except KeyError:
        # In this case, sourceResource is not present, so give up and return
        # the original data unmodified.
        id_for_msg = data.get('_id', '[no id]')
        logger.warning('enrich-type lacks sourceResource for _id %s' %
                       id_for_msg)
        return body

    type_strings = []
    format_strings = []

    if sr_type:
        for t in sr_type if isinstance(sr_type, list) else [sr_type]:
            if isinstance(t, dict):
                t = t.get('#text', '')
            # Besides JSON null, numbers and nested containers have no
            # lower(); skip them all rather than fail the whole document.
            # The check is duck-typed so any string type is accepted.
            if hasattr(t, 'lower'):
                type_strings.append(t.lower())

    if sr_format:
        for f in sr_format if isinstance(sr_format, list) else [sr_format]:
            if hasattr(f, 'lower'):
                format_strings.append(f.lower())

    try:
        source_resource['type'] = itemtype.type_for_strings_and_mappings([
            (format_strings, type_for_format_keyword),
            (type_strings, type_for_type_keyword),
        ])
    except itemtype.NoTypeError:
        id_for_msg = data.get('_id', '[no id]')
        logger.warning('Can not deduce type for item with _id: %s' %
                       id_for_msg)
        if default:
            source_resource['type'] = default
        else:
            # Drop any existing type; a missing key is not an error.
            source_resource.pop('type', None)
    finally:
        if send_rejects_to_format and type_strings:
            rej = itemtype.rejects([(type_strings, type_for_type_keyword)])
            if rej:
                if not isinstance(sr_format, list):
                    sr_format = [sr_format]
                sr_format.extend(rej)
                source_resource['format'] = sr_format

    return json.dumps(data)