def import_stage(self, harvest_object):
    '''
    The import stage will receive a HarvestObject object and will be
    responsible for:
        - performing any necessary action with the fetched object (e.g
          create a CKAN package).
          Note: if this stage creates or updates a package, a reference
          to the package must be added to the HarvestObject.
          Additionally, the HarvestObject must be flagged as current.
        - creating the HarvestObject - Package relation (if necessary)
        - creating and storing any suitable HarvestObjectErrors that
          may occur.
        - returning True if everything went as expected, False otherwise.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    model.repo.new_revision()
    # harvest_object.content is the JSON payload produced by the fetch
    # stage; it carries 'domain', and (on success) 'records' plus
    # 'set_name'.
    master_data = json.loads(harvest_object.content)
    domain = master_data['domain']
    # One CKAN group per harvested domain, created on first sight.
    group = Group.get(domain)
    if not group:
        group = Group(name=domain, description=domain)
    if 'records' in master_data:
        records = master_data['records']
        set_name = master_data['set_name']
        for rec in records:
            # Each record unpacks to (identifier, metadata, about);
            # the third element is unused here.
            identifier, metadata, _ = rec
            if metadata:
                # Metadata fields arrive as lists (Dublin Core style);
                # fall back to the OAI identifier when title is empty.
                name = metadata['title'][0] if len(metadata['title'])\
                    else identifier
                title = name
                # Build an ASCII slug (max 35 chars) for the package
                # name: strip accents, lowercase, spaces -> underscores,
                # then keep only letters and '_'.
                norm_title = unicodedata.normalize('NFKD', name)\
                    .encode('ASCII', 'ignore')\
                    .lower().replace(' ', '_')[:35]
                slug = ''.join(e for e in norm_title
                               if e in string.ascii_letters + '_')
                name = slug
                creator = metadata['creator'][0]\
                    if len(metadata['creator']) else ''
                description = metadata['description'][0]\
                    if len(metadata['description']) else ''
                # Reuse an existing package with the same slug,
                # otherwise create a new one.
                pkg = Package.by_name(name)
                if not pkg:
                    pkg = Package(name=name, title=title)
                extras = {}
                for met in metadata.items():
                    key, value = met
                    if len(value) > 0:
                        if key == 'subject' or key == 'type':
                            # 'subject'/'type' values become tags,
                            # munged and capped at 100 chars.
                            for tag in value:
                                if tag:
                                    tag = munge_tag(tag[:100])
                                    tag_obj = model.Tag.by_name(tag)
                                    if not tag_obj:
                                        tag_obj = model.Tag(name=tag)
                                    if tag_obj:
                                        pkgtag = model.PackageTag(
                                            tag=tag_obj,
                                            package=pkg)
                                        Session.add(tag_obj)
                                        Session.add(pkgtag)
                        else:
                            # Every other multi-valued field is
                            # flattened into a single package extra.
                            extras[key] = ' '.join(value)
                pkg.author = creator
                # NOTE(review): the creator string is also stored as
                # author_email — likely not a real e-mail address;
                # confirm this is intentional.
                pkg.author_email = creator
                pkg.title = title
                pkg.notes = description
                pkg.extras = extras
                # Link back to the original record on the OAI-PMH
                # source server.
                pkg.url = \
                    "%s?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc"\
                    % (harvest_object.job.source.url, identifier)
                pkg.save()
                # Record which package this harvest object produced.
                harvest_object.package_id = pkg.id
                Session.add(harvest_object)
                setup_default_user_roles(pkg)
                # Use the last 'identifier' value that looks like an
                # http URL as the resource URL (empty string if none).
                url = ''
                for ids in metadata['identifier']:
                    if ids.startswith('http://'):
                        url = ids
                title = metadata['title'][0] if len(metadata['title'])\
                    else ''
                description = metadata['description'][0]\
                    if len(metadata['description']) else ''
                pkg.add_resource(url, description=description,
                                 name=title)
                # File the package under the domain group and under a
                # "<domain> - <set_name>" subgroup, created on demand.
                group.add_package_by_name(pkg.name)
                subg_name = "%s - %s" % (domain, set_name)
                subgroup = Group.by_name(subg_name)
                if not subgroup:
                    subgroup = Group(name=subg_name,
                                     description=subg_name)
                subgroup.add_package_by_name(pkg.name)
                Session.add(group)
                Session.add(subgroup)
                setup_default_user_roles(group)
                setup_default_user_roles(subgroup)
                # Commit after each record so earlier packages survive
                # a failure later in the batch.
                model.repo.commit()
    else:
        # Fetch stage delivered no records: log an import error
        # against this harvest object and signal failure.
        self._save_object_error('Could not receive any objects from fetch!'
                                , harvest_object, stage='Import')
        return False
    return True
def import_stage(self, harvest_object): """Import the metadata received in the fetch stage to a dataset and create groups if ones are defined. Fill in metadata from study and document description. """ try: xml_dict = {} xml_dict["source"] = harvest_object.content udict = json.loads(harvest_object.content) if "url" in udict: f = urllib2.urlopen(udict["url"]).read() ddi_xml = BeautifulSoup(f, "xml") else: self._save_object_error("No url in content!", harvest_object) return False except urllib2.URLError: self._save_object_error("Could not fetch from url %s!" % udict["url"], harvest_object) return False except etree.XMLSyntaxError: self._save_object_error("Unable to parse XML!", harvest_object) return False model.repo.new_revision() study_descr = ddi_xml.codeBook.stdyDscr document_info = ddi_xml.codeBook.docDscr.citation title = study_descr.citation.titlStmt.titl.string if not title: title = document_info.titlStmt.titl.string name = study_descr.citation.titlStmt.IDNo.string update = True pkg = Package.get(name) if not pkg: pkg = Package(name=name) update = False producer = study_descr.citation.prodStmt.producer if not producer: producer = study_descr.citation.rspStmt.AuthEnty if not producer: producer = study_descr.citation.rspStmt.othId pkg.author = producer.string pkg.maintainer = producer.string if study_descr.citation.distStmt.contact: pkg.maintainer = study_descr.citation.distStmt.contact.string if document_info.titlStmt.IDNo: pkg.id = document_info.titlStmt.IDNo.string keywords = study_descr.stdyInfo.subject(re.compile("keyword|topcClas")) keywords = list(set(keywords)) for kw in keywords: if kw: vocab = None kw_str = "" if kw.string: kw_str = kw.string if "vocab" in kw.attrs: vocab = kw.attrs.get("vocab", None) if vocab and kw.string: kw_str = vocab + " " + kw.string pkg.add_tag_by_name(munge_tag(kw_str)) if study_descr.stdyInfo.abstract: description_array = study_descr.stdyInfo.abstract("p") else: description_array = study_descr.citation.serStmt.serInfo("p") 
pkg.notes = "<br />".join([description.string for description in description_array]) pkg.title = title[:100] pkg.url = udict["url"] if not update: ofs = get_ofs() nowstr = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f") idno = study_descr.citation.titlStmt.IDNo agencyxml = (idno["agency"] if "agency" in idno.attrs else "") + idno.string label = "%s/%s.xml" % (nowstr, agencyxml) ofs.put_stream(BUCKET, label, f, {}) fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label) pkg.add_resource(url=fileurl, description="Original metadata record", format="xml", size=len(f)) pkg.add_resource( url=document_info.holdings["URI"] if "URI" in document_info.holdings else "", description=title ) metas = {} descendants = [desc for desc in document_info.descendants] + [sdesc for sdesc in study_descr.descendants] for docextra in descendants: if isinstance(docextra, Tag): if docextra: if docextra.name == "p": docextra.name = docextra.parent.name if not docextra.name in metas and docextra.string: metas[docextra.name] = docextra.string if docextra.string else self._collect_attribs(docextra) else: if docextra.string: metas[docextra.name] += ( " " + docextra.string if docextra.string else self._collect_attribs(docextra) ) if ddi_xml.codeBook.dataDscr and not update: vars = ddi_xml.codeBook.dataDscr("var") heads = self._get_headers() c_heads = ["ID", "catValu", "labl", "catStat"] f_var = StringIO.StringIO() c_var = StringIO.StringIO() varwriter = csv.DictWriter(f_var, heads) codewriter = csv.DictWriter(c_var, c_heads) heading_row = {} for head in heads: heading_row[head] = head c_heading_row = {} for head in c_heads: c_heading_row[head] = head varwriter.writerow(heading_row) codewriter.writerow(c_heading_row) for var in vars: try: varwriter.writerow(self._construct_csv(var, heads)) codewriter.writerows(self._create_code_rows(var)) except ValueError, e: raise IOError("Failed to import DDI to CSV! 
%s" % e) f_var.flush() label = "%s/%s_var.csv" % (nowstr, name) ofs.put_stream(BUCKET, label, f_var, {}) fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label) pkg.add_resource(url=fileurl, description="Variable metadata", format="csv", size=f_var.len) label = "%s/%s_code.csv" % (nowstr, name) ofs.put_stream(BUCKET, label, c_var, {}) fileurl = config.get("ckan.site_url") + h.url_for("storage_file", label=label) pkg.add_resource(url=fileurl, description="Variable code values", format="csv", size=c_var.len) f_var.seek(0) reader = csv.DictReader(f_var) for var in reader: metas[var["ID"]] = var["labl"] if "labl" in var else var["qstnLit"]