def prepareFile(package_id, resource_id, sheet_id=None, options=None):
    """Prepare a single resource (or one sheet of it) in the workspace.

    Looks up the workspace record for ``resource_id`` / ``sheet_id`` and uses
    its ``name`` and ``file`` entries when present; otherwise the resource is
    fetched through ``ckanResourceQuery`` and the path through
    ``resourceUtil``. Zip resources are handed to ``extractZip``; everything
    else goes straight to ``importer.processFile``.

    :param package_id: id of the package that owns the resource
    :param resource_id: id of the resource to prepare
    :param sheet_id: optional sheet id used in the workspace lookup and passed
        on to the importer
    :param options: optional dict forwarded to ``extractZip`` /
        ``importer.processFile``; defaults to a new empty dict
    :returns: None
    """
    # Build a fresh dict per call; a `{}` default is created once and would be
    # shared (and could be mutated) across calls.
    if options is None:
        options = {}

    sheetInfo = collections.get("resource").find_one({
        "resourceId" : resource_id,
        "sheetId" : sheet_id
    })
    if sheetInfo is None:
        sheetInfo = {}

    # get the name of the resource
    if 'name' in sheetInfo:
        resource = sheetInfo
    else:
        # fallback on querying PG for the name
        resource = ckanResourceQuery.get(resource_id)

    # see if we have the path, otherwise look it up
    if 'file' in sheetInfo:
        filepath = sheetInfo.get('file')
    else:
        filepath = resourceUtil.get_path(resource_id)

    ext = _getFileExtension(resource.get('name'))

    # much like the resource loop in prepare()
    if ext == "zip":
        # Pass resource_id rather than resource.get('id'): when `resource` is
        # the workspace record it is keyed by 'resourceId', so 'id' may be
        # missing and None would be handed to extractZip.
        # NOTE(review): assumes a CKAN resource's 'id' equals the resource_id
        # it was fetched with -- confirm.
        extractZip(package_id, resource_id, filepath, resource.get('name'), options=options)
    else:
        importer.processFile(filepath, package_id, resource_id, sheetId=sheet_id, options=options, resource=resource)
def prepare(package_id, force=False):
    """Prepare every active resource of a package in the workspace.

    :param package_id: id of the package to prepare
    :param force: when true, run even if the workspace record is already
        flagged as prepared
    :returns: a small success/message dict when the package is already
        prepared and ``force`` is false; otherwise the workspace package
        record that was written, including ``runInfo`` with one entry per
        processed file
    :raises Exception: if the CKAN package state is ``'deleted'``
    """
    packageInfo = collections.get("package").find_one({"packageId": package_id}, {"_id": 0})

    # start a new workspace record when none is stored yet
    if packageInfo is None:
        packageInfo = {"packageId": package_id}

    # nothing to do unless the caller forces a re-run
    if not force:
        if packageInfo.get("prepared") == True:
            return {
                "success": True,
                "message": "already prepared, use force flag to force prepare"
            }

    # refuse to prepare a package that has been deleted
    ckanPackage = ckanPackageQuery.get(package_id)
    if ckanPackage.get('state') == 'deleted':
        raise Exception('Package has been deleted')

    runInfo = []
    for res in ckanResourceQuery.active(package_id):
        # path on disk and file extension decide how the resource is handled
        resPath = resourceUtil.get_path(res.get('id'))
        extension = _getFileExtension(res.get('name'))

        if extension != "zip":
            # 'normal' (non-zip) file: import it directly
            runInfo.append(importer.processFile(resPath, package_id, res.get('id'), resource=res))
            continue

        # zip file: one status entry comes back per handled item
        # TODO: we should be checking a zip hash before we go unzipping every time
        runInfo.extend(extractZip(package_id, res.get('id'), resPath, res.get('name')))

    # record what we did (or did not) do and persist it
    packageInfo["runInfo"] = runInfo
    packageInfo["lastTouched"] = datetime.utcnow()
    packageInfo["prepared"] = True

    collections.get("package").update({"packageId": package_id}, packageInfo, upsert=True)
    return packageInfo
def extractZip(package_id, resource_id, zipPath, zipName, options=None):
    """Unpack a zip resource into the workspace and import its data files.

    The zip is skipped when its hash matches the one stored on the workspace
    resource record. Otherwise the record is updated, any previous extraction
    directory is removed, every data file (dot files excluded) is extracted
    and passed to ``importer.processFile``, and finally resource / spectra
    documents belonging to this zip but no longer present in it are removed.

    :param package_id: id of the package that owns the zip resource
    :param resource_id: id of the zip resource
    :param zipPath: path of the zip file on disk
    :param zipName: name of the zip resource, used in status entries and on
        the records created for extracted files
    :param options: optional dict forwarded to ``importer.processFile``;
        defaults to a new empty dict
    :returns: list of status entries -- one for the zip itself followed by
        whatever ``importer.processFile`` returns for each extracted file
    """
    # Build a fresh dict per call; a `{}` default would be shared across calls.
    if options is None:
        options = {}

    status = []

    # check to see if there are any changes
    zipFileInfo = collections.get("resource").find_one({
        "packageId" : package_id,
        "resourceId" : resource_id
    })
    if zipFileInfo is None:
        zipFileInfo = {}

    zipHash = importer.hashfile(zipPath)

    # if hashes are equal, nothing has changed
    if zipFileInfo.get("hash") == zipHash:
        status.append({
            "resourceId" : resource_id,
            "name" : zipName,
            "unzipped" : False,
            "message" : "nothing todo, hash is equal"
        })
        return status

    # Send info back about what was processed
    zipFileInfo['hash'] = zipHash
    zipFileInfo['resourceId'] = resource_id
    zipFileInfo['packageId'] = package_id
    zipFileInfo['file'] = zipPath
    zipFileInfo['isZip'] = True

    # update resource collection
    # NOTE(review): the new hash is stored before extraction runs, so a failure
    # further down leaves the zip looking unchanged on the next call.
    collections.get("resource").update({
        "packageId" : package_id,
        "resourceId" : resource_id
    }, zipFileInfo, upsert=True)

    status.append({
        "resourceId" : resource_id,
        "name" : zipName,
        "unzipped" : True
    })

    # get the workspace path on disk
    workspacePath = os.path.join(workspaceDir, package_id, resource_id)

    # clean out any existing extraction
    if os.path.exists(workspacePath):
        shutil.rmtree(workspacePath)

    zipPackageIds = []

    # the context manager closes the archive even if extraction or import fails
    with zipfile.ZipFile(zipPath, "r") as z:
        for info in z.infolist():
            # TODO: implement .ecosis file
            if not _isDataFile(info.filename):
                continue

            # strip any directory prefix to get the bare file name
            name = re.sub(r".*/", "", info.filename)
            if name.startswith("."):
                # ignore .dot files
                continue

            # create id for individual file
            zipResourceId = _getZipResourceId(resource_id, info.filename)

            # extract individual file
            z.extract(info, workspacePath)

            # check for existing config
            resource = collections.get("resource").find_one({
                "packageId" : package_id,
                "resourceId" : zipResourceId
            })

            # create new config if one doesn't exist
            if resource is None:
                resource = {
                    "packageId" : package_id,
                    "resourceId" : zipResourceId,
                    "name" : name,
                    "file" : os.path.join(workspacePath, info.filename),
                    "zip" : {
                        "name" : zipName,
                        "resourceId" : resource_id
                    },
                    "fromZip" : True
                }

                collections.get("resource").update({
                    "packageId" : package_id,
                    "resourceId" : zipResourceId
                }, resource, upsert=True)

            zipPackageIds.append(zipResourceId)

            # now we pass with new resource id, but path to file
            result = importer.processFile(resource.get('file'), package_id, zipResourceId, resource=resource, options=options)
            status.append(result)

    # cleanup: drop records for files that are no longer in this zip
    collections.get("resource").remove({
        "packageId" : package_id,
        "zip.resourceId" : resource_id,
        "resourceId" : {
            "$nin" : zipPackageIds
        }
    })

    # more cleanup: same filter against the spectra collection
    collections.get("spectra").remove({
        "packageId" : package_id,
        "zip.resourceId" : resource_id,
        "resourceId" : {
            "$nin" : zipPackageIds
        }
    })

    return status