def ingest_series(self, naId=None, dest=None, offset=0):
    """Ingests a series into Drastic."""
    if naId is None or dest is None:
        raise Exception("URL and destination path are required")
    app.check_traversal_okay(self)

    # Fetch the series description from the catalog API.
    catalog_doc = requests.get(SERIES_URL.format(naId)).json()
    description = catalog_doc['opaResponse']['results']['result'][0]['description']

    # The series title becomes the name of the base folder.
    base_path = dest + description['series']['title'] + '/'

    # Only create the base folder when it does not already exist.
    probe = get_client().get_cdmi(base_path)
    if probe.code() == 404:
        logger.info("Creating base folder in Drastic: " + base_path)
        create_res = get_client().put_cdmi(base_path, description)
        if not create_res.ok():
            raise IOError(str(create_res))
        logger.info("Base folder created: " + base_path)

    # Kick off ingest of the first page of results.
    schedule_page.s([], naId=naId, dest=base_path, offset=offset).apply_async()
def incr_batch_progress(batch_dir, file_cnt=0, file_bytes_cnt=0, folder_cnt=0, done=False):
    """Increments the batch progress counters stored as metadata on batch_dir.

    :param batch_dir: path of the batch folder in Drastic
    :param file_cnt: number of files to add to the progress count
    :param file_bytes_cnt: number of file bytes to add
    :param folder_cnt: number of folders to add
    :param done: when True, marks the batch finished and records an end time
    :raises IOError: if Drastic metadata cannot be read or written
    """
    # Get existing metadata in Drastic
    res = get_client().get_cdmi(batch_dir)
    if not res.ok():
        raise IOError("Drastic get_cdmi failed: {0}".format(res.msg()))
    metadata = res.json()['metadata']

    progress_file = file_cnt + int(metadata.get('batch_file_progress', 0))
    # BUG FIX: the byte counter was previously read back under the misspelled
    # key 'batch_file__bytes_progress' (double underscore), so byte progress
    # always restarted from zero. Read the same key that is written below.
    progress_file_bytes = file_bytes_cnt + int(
        metadata.get('batch_file_bytes_progress', 0))
    progress_folder = folder_cnt + int(metadata.get('batch_folder_progress', 0))

    metadata['batch_file_progress'] = progress_file
    metadata['batch_file_bytes_progress'] = progress_file_bytes
    metadata['batch_folder_progress'] = progress_folder
    if done:
        metadata['batch_state'] = 'done'
        metadata['batch_epoch_end'] = int(time.time())
    r = get_client().put(batch_dir, metadata=metadata)
    if not r.ok():
        raise IOError(str(r))
def pollForTextConversion(self, path, link):
    """Tries to download text when available.

    Fetches the plain-text rendition of the object from `link` and stores it
    in the object's 'fulltext' metadata field in Drastic. Retries on I/O
    failures; silently drops the task if the object is gone (404/403).
    """
    headers = {
        'Accept': 'text/plain',
        'Authorization': "Basic {0}".format(dap_auth_encoded)
    }
    try:
        r = requests.get(link, headers=headers)
        r.raise_for_status()
        res = get_client().get_cdmi(str(path))
        if res.code() in [404, 403]:
            # BUG FIX: message said "403/403"; the codes tested are 404/403.
            logger.warn(
                "Dropping task for object that gives a 404/403: {0}".format(
                    path))
            return
        if not res.ok():
            raise IOError("Drastic get_cdmi failed: {0}".format(res.msg()))
        cdmi_info = res.json()
        metadata = cdmi_info['metadata']
        metadata['fulltext'] = r.text
        res = get_client().put(path, metadata=metadata)
        if res.code() in [404, 403]:  # object probably deleted
            logger.warn(
                "Dropping task for an object that gives a 404/403: {0}".format(
                    path))
            return
        if not res.ok():
            raise IOError(str(res))
    except IOError as e:
        raise self.retry(exc=e)
def folders_complete(folder_cnt, batch_dir):
    """Records the final folder-creation progress for a finished batch."""
    logger.info(u"Folders created for batch: {0}".format(batch_dir))
    # Pull the current metadata, overwrite the folder progress, write it back.
    fetch = get_client().get_cdmi(batch_dir)
    if not fetch.ok():
        raise IOError("Drastic get_cdmi failed: {0}".format(fetch.msg()))
    meta = fetch.json()['metadata']
    meta['batch_folder_progress'] = folder_cnt
    store = get_client().put(batch_dir, metadata=meta)
    if not store.ok():
        raise IOError(str(store))
def ingest_httpfile(self, url, destPath, name=None, mimetype=None, metadata=None):
    """Ingests the file at the given URL into Drastic.

    :param url: source URL to download
    :param destPath: destination folder path in Drastic
    :param name: object name; defaults to the basename of the URL path
    :param mimetype: optional mimetype of the file (accepted for
        compatibility with callers such as ingest_property_cards;
        NOTE(review): not yet passed to the client put -- confirm whether
        the client supports it)
    :param metadata: optional dict of metadata to store with the object
    """
    parsed = urlparse(url)
    if name is None:
        name = basename(parsed.path)
    name = name.replace('&', '_')
    dest = destPath + '/' + name
    tempfilename = None
    try:
        tempfilename = download_tempfile(url)
    except IOError as e:
        # BUG FIX: a failed download used to call os.remove(None), raising a
        # TypeError that masked the retry. Only remove a file that exists.
        if tempfilename is not None and os.path.exists(tempfilename):
            os.remove(tempfilename)
        raise self.retry(exc=e)
    try:
        logger.debug(u"Downloaded file to: " + tempfilename)
        with closing(open(tempfilename, 'rb')) as f:
            # put() accepts a metadata kwarg elsewhere in this module; only
            # pass it when the caller supplied metadata.
            if metadata is not None:
                res = get_client().put(dest, f, metadata=metadata)
            else:
                res = get_client().put(dest, f)
            if res.code() in [406, 999]:
                return
            if not res.ok():
                raise IOError('Failed to put {} to {}. Got {} {}'.format(
                    tempfilename, dest, res.code(), res.msg()))
            # BUG FIX: success log previously concatenated destPath + name
            # without the '/' separator used for the actual put.
            logger.debug(u"put success for {0}".format(dest))
    finally:
        os.remove(tempfilename)
def batch_ingest_httpdir(self, url=None, dest=None):
    """Batches the folders and files under the path given, using the NGINX
    JSON directory autoindex.

    Creates the batch folder in Drastic, records overall counts in its
    metadata, then chains: create all folders -> mark folders complete ->
    ingest all files.

    :param url: NGINX autoindex URL of the directory to ingest
    :param dest: destination parent folder path in Drastic
    :raises IOError: if the batch folder cannot be created
    """
    epoch_start = int(time.time())
    # Create top folder in Drastic
    res = requests.get(url)
    res.raise_for_status()
    # NOTE(review): assumes `url` ends with a slash, so the directory name is
    # the second-to-last path segment -- confirm with callers.
    dirname = urlparse(url).path.split('/')[-2]
    batch_dir = os.path.join(dest, dirname) + '/'
    res = get_client().mkdir(batch_dir)
    if not res.ok():
        raise IOError('Cannot make folder {0}: {1}'.format(
            batch_dir, str(res)))
    logger.info(u"Batch ingest starting: " + batch_dir)
    # Schedule a recursive count, then record it in Drastic metadata
    (file_cnt, file_byte_cnt, folder_cnt) = count_httpdir(url=url)
    logger.info(u"Batch count complete, {0} files, {1} bytes.".format(
        file_cnt, file_byte_cnt))
    record_batch_count(file_cnt, file_byte_cnt, folder_cnt, epoch_start,
                       batch_dir)
    # Immutable signatures (.si) so results are not passed down the chain.
    mkdirs = mkdirs_httpdir.si(url, batch_dir)  # batch_dir /NARA/RG .....
    fc = folders_complete.si(folder_cnt, batch_dir)
    ingest = ingest_files.si(url, batch_dir)
    # Celery chain: folders must exist before files are ingested.
    (mkdirs | fc | ingest).apply_async()
def record_batch_count(file_cnt, file_bytes_cnt, folder_cnt, epoch_start, batch_dir):
    """Records total counts and initial (zero) progress for a new batch in
    the Drastic metadata of batch_dir.

    :raises IOError: if Drastic metadata cannot be read or written
    """
    # Get existing metadata in Drastic.
    # BUG FIX: this called ls() while reporting a "get_cdmi failed" error;
    # every other metadata read in this module uses get_cdmi(), so use it
    # here as well for consistency.
    res = get_client().get_cdmi(batch_dir)
    if not res.ok():
        raise IOError("Drastic get_cdmi failed: {0}".format(res.msg()))
    metadata = res.json()['metadata']
    metadata['batch_file_count'] = file_cnt
    metadata['batch_file_bytes_count'] = file_bytes_cnt
    metadata['batch_folder_count'] = folder_cnt
    metadata['batch_epoch_start'] = epoch_start
    metadata['batch_state'] = 'ingesting'
    metadata['batch_file_progress'] = 0
    metadata['batch_file_bytes_progress'] = 0
    metadata['batch_folder_progress'] = 0
    r = get_client().put(batch_dir, metadata=metadata)
    if not r.ok():
        raise IOError(str(r))
def put_graph_metadata(self, path):
    """Replaces existing user triples for a single subject.

    Reads the object's CDMI info from Drastic, drops any existing graph
    vertex for its URI, then creates a new 'resource' vertex carrying the
    object's metadata as properties plus a 'contains' edge from its parent
    container.
    """
    # BUG FIX: the format string used positional index {1} with a single
    # argument, which raised IndexError before any work was done.
    logger.debug(u'PUT RDF metadata for {0}'.format(path))
    path = path[:-1] if path.endswith('?') else path
    is_folder = True if str(path).endswith('/') else False
    try:
        res = get_client().get_cdmi(str(path))
        if res.code() in [404, 403]:
            # BUG FIX: message said "403/403"; the codes tested are 404/403.
            logger.warn(
                "Dropping task for object that gives a 404/403: {0}".format(
                    path))
            return
        if not res.ok():
            raise IOError("Drastic get_cdmi failed: {0}".format(res.msg()))
        cdmi_info = res.json()
    except IOError as e:
        raise self.retry(exc=e)
    # Drastic fields:
    # FIXME name is not the key, is null
    name = cdmi_info.get('objectName')
    name = name[:-1] if name.endswith('?') else name
    object_UUID = cdmi_info.get('objectID')
    container_UUID = cdmi_info.get('parentID')
    # parent_URI = cdmi_info.get('parentURI')
    mimetype = 'text/directory'
    if not is_folder:
        mimetype = cdmi_info.get('mimetype')
    metadata = cdmi_info.get('metadata')
    uri = "uuid:{0}".format(object_UUID)
    # Drop any existing vertex for this URI before re-creating it.
    get_g().V().has('resource', 'URI', uri).drop().count().next()
    t = get_g().addV('resource')
    t = t.property('URI', uri)
    t = t.property('graph', uri)
    t = t.property('name', name)
    t = t.property('mimetype', mimetype)
    # items() works on both Python 2 and 3 (iteritems() is py2-only).
    for key, value in metadata.items():
        # Don't store metadata without value
        if value is None:  # numeric zero is a valid value
            continue
        t = t.property(key, value)  # key/values as properties
        # TODO add default namespace for keys that are plain tokens
        # t = add_literal_edge(t, uri, key, value)
    # Add contains Edge
    if container_UUID is not None:
        container_uri = "uuid:{0}".format(container_UUID)
        c = get_g().V().has('resource', 'URI', container_uri)
        # TODO fully qualify URIs
        t = t.addE('contains').from_(c)
    t.next()
    # BUG FIX: was logging via the root `logging` module instead of this
    # module's `logger`.
    logger.debug(u'Created resource vertex for {0}'.format(object_UUID))
def traversal(self, path, task_name, only_files, include_pattern=None):
    """Traverses the file tree under the path given, within the CDMI service.
    Applies the named task to every path.

    :param path: CDMI container path to traverse
    :param task_name: name of the Celery task to send for each child
    :param only_files: when True, apply the task only to file children;
        when False, apply it to every child
    :param include_pattern: optional regex; only matching children get tasks
    """
    app.check_traversal_okay(self)
    path = path[:-1] if path.endswith('?') else path
    try:
        res = get_client().ls(path)
        if res.code() in [404, 403]:  # object probably deleted
            logger.warn(
                "Dropping task for an object that gives a 403/403: {0}".format(
                    path))
            return
        if not res.ok():
            raise IOError(str(res))
    except IOError as e:
        raise self.retry(exc=e)
    cdmi_info = res.json()
    # logger.debug('got CDMI content: {0}'.format(json.dumps(cdmi_info)))
    if not cdmi_info[u'objectType'] == u'application/cdmi-container':
        logger.error("Cannot traverse a file path: {0}".format(path))
        return
    regex_compiled = None
    if include_pattern is not None:
        regex_compiled = re.compile(include_pattern)
    if only_files:
        for f in cdmi_info[u'children']:
            f = f[:-1] if f.endswith('?') else f
            # BUG FIX: children ending in '/' are containers; with only_files
            # set, tasks were being sent for folders instead of files.
            if not f.endswith('/'):
                # filter matches with regex
                if include_pattern is None or regex_compiled.match(
                        f) is not None:
                    app.send_task(task_name, args=[str(path) + f], kwargs={})
    else:
        for o in cdmi_info[u'children']:
            o = o[:-1] if o.endswith('?') else o
            # filter matches with regex
            # BUG FIX: this branch tested the undefined name `f` (NameError);
            # it must test the current child `o`.
            if include_pattern is None or regex_compiled.match(o) is not None:
                app.send_task(task_name, args=[str(path) + o], kwargs={})
    # Recurse into child containers.
    for x in cdmi_info[u'children']:
        x = x[:-1] if x.endswith('?') else x
        if x.endswith('/'):
            traversal.s(str(path) + x, task_name, only_files,
                        include_pattern=include_pattern).apply_async()
def mkdirs_httpdir(url, batch_dir):
    """Creates, under batch_dir, the folder tree mirroring the directories
    found in the NGINX JSON directory autoindex at `url`.

    (BUG FIX: the old docstring said this "counts" folders and files; it
    creates folders.) Progress is reported in increments of `notifyCount`
    via incr_batch_progress tasks.
    """
    count = 0
    notifyCount = 20  # flush progress to Drastic every 20 folders created
    for (f, parentPath, furl) in iter_httpdir(url, files=False):
        name = str(f['name'])
        new_folder_path = os.path.join(batch_dir, parentPath, name) + '/'
        logger.debug(u'new_folder_path: {0}'.format(new_folder_path))
        res = get_client().mkdir(new_folder_path)
        if not res.ok():
            # Best-effort: log and keep going; this folder is not counted.
            logger.error(u'Cannot make directory: {0}'.format(new_folder_path))
            continue
        count += 1
        if count >= notifyCount:
            incr_batch_progress.s(batch_dir, folder_cnt=count).apply_async()
            count = 0
    # Flush the remainder; skip scheduling a no-op task when there is
    # nothing left to add (the old code always sent one, even with count 0).
    if count > 0:
        incr_batch_progress.s(batch_dir, folder_cnt=count).apply_async()
def ingest_httpdir(self, url=None, dest=None):
    """Ingests the file tree under the path given, using the NGINX JSON
    directory autoindex."""
    if url is None or dest is None:
        raise Exception("URL and destination path are required")
    app.check_traversal_okay(self)
    try:
        # Fetch the autoindex listing for this directory.
        listing = requests.get(url)
        listing.raise_for_status()
        entries = listing.json()

        # Mirror the directory itself in Drastic.
        folder_name = urlparse(url).path.split('/')[-2]
        target = dest + folder_name + '/'
        logger.debug(u"DIRNAME " + target)
        mkdir_res = get_client().mkdir(target)
        if not mkdir_res.ok():
            raise IOError(str(mkdir_res))
        logger.debug(u"DIRECTORY INGESTED: " + target)

        # Build task signatures for the files and subdirectories listed.
        file_sigs = [
            ingest_httpfile.s(str(url) + entry['name'], target)
            for entry in entries if 'file' == entry['type']
        ]
        dir_sigs = [
            ingest_httpdir.s(url=str(url) + entry['name'] + '/', dest=target)
            for entry in entries if 'directory' == entry['type']
        ]

        # Files ingest in parallel; subdirectories recurse independently.
        group(file_sigs).apply_async()
        group(dir_sigs).apply_async()
    except IOError as e:
        raise self.retry(exc=e)
def ingest_property_cards(self, dest=None):
    """Ingests the "property card" file units of NARA series 3725265 into
    Drastic, creating one folder per file unit and one ingest task per
    digital object.

    :param dest: destination parent folder path in Drastic
    :raises IOError: if a file-unit folder cannot be created
    """
    if dest is None:
        raise Exception("Destination path is required")
    app.check_traversal_okay(self)
    url = ("https://catalog.archives.gov/api/v1?q=title:\"property card\""
           "&description.fileUnit.parentSeries.naId=3725265"
           "&type=description"
           "&resultFields=naId,description,objects"
           "&rows=200")
    # FIXME Add the login for NARA CATALOG API
    # Get series description
    series_json = requests.get(url).json()
    for result in series_json['opaResponse']['results']['result']:
        ingest_tasks = []
        # naId = result['naId']
        title = result['description']['fileUnit']['title']
        new_folder_path = dest + title + '/'
        res = get_client().mkdir(new_folder_path)
        if not res.ok():
            # BUG FIX: message typo 'Got and error' -> 'Got an error'.
            logger.error('Got an error ({0}) creating folder {1}'
                         .format(str(res), new_folder_path))
            raise IOError(str(res))
        # si: create folder
        for obj in result['objects']['object']:
            file_stuff = obj['file']
            # Renamed from `url` to avoid shadowing the query URL above;
            # also dropped the unused local `idnum` (was obj['@id']).
            obj_url = file_stuff['@url']
            mime = file_stuff['@mime']
            name = str(file_stuff['@name'])
            # NOTE(review): ingest_httpfile must accept mimetype/metadata
            # kwargs -- confirm the task signature matches.
            s = ingest_httpfile.s(obj_url, new_folder_path, name=name,
                                  mimetype=mime, metadata=obj)
            ingest_tasks.append(s)
        group(ingest_tasks).apply_async()
def index(self, path):
    """Reindexes the metadata for a data object.

    Builds an Elasticsearch document from the object's CDMI info (path,
    identity fields, extracted Brown Dog fields, and fulltext when
    available) and POSTs it to the 'drastic' index.
    """
    from index.util import add_BD_fields_legacy, readMaxText
    path = path[:-1] if path.endswith('?') else path
    mytype = 'folder' if str(path).endswith('/') else 'file'
    esdoc = {}
    esdoc['path'] = str(path)
    esdoc['pathtext'] = str(path)
    try:
        res = get_client().get_cdmi(str(path))
        if res.code() in [404, 403]:
            # BUG FIX: message said "403/403"; the codes tested are 404/403.
            logger.warn(
                "Dropping task for object that gives a 404/403: {0}".format(
                    path))
            return
        if not res.ok():
            raise IOError("Drastic get_cdmi failed: {0}".format(res.msg()))
        cdmi_info = res.json()
    except IOError as e:
        raise self.retry(exc=e)
    # Drastic fields:
    # FIXME name is not the key, is null
    name = cdmi_info.get('objectName')
    # BUG FIX: objectName can be missing/None (see FIXME above); guard
    # before calling endswith to avoid an AttributeError.
    if name is not None and name.endswith('?'):
        name = name[:-1]
    esdoc['objectName'] = name
    esdoc['objectID'] = cdmi_info.get('objectID')
    esdoc['parentID'] = cdmi_info.get('parentID')
    esdoc['parentURI'] = cdmi_info.get('parentURI')
    esdoc['mimetype'] = cdmi_info.get('mimetype')
    # TODO esdoc['size'] = cdmi_info.get('size')
    # BUG FIX: `'x' in cdmi_info.get('metadata')` raised TypeError when the
    # metadata field was absent; default to an empty dict.
    obj_meta = cdmi_info.get('metadata') or {}
    # If we have extracted metadata from Brown Dog, add any mapped fields
    if 'dts_metadata.jsonld' in obj_meta:
        add_BD_fields_legacy(obj_meta.get('dts_metadata.jsonld', '[]'), esdoc)
    if 'dts_tags.json' in obj_meta:
        esdoc['dts_tags'] = obj_meta.get('dts_tags.json')
    # if file mimetype is already text/plain, index it as fulltext
    if 'text/plain' == cdmi_info.get('mimetype'):
        try:
            with closing(stream_from_drastic_proxy(path)) as stream:
                esdoc['fulltext'] = readMaxText(stream,
                                                fulltext_max_index_size)
        except IOError as e:
            logger.warn(
                "Cannot get original object text for indexing: {0}".format(
                    str(e)))
    elif 'fulltext' in obj_meta:
        esdoc['fulltext'] = obj_meta.get('fulltext')
    logger.debug('ESDOC:\n{0}'.format(json.dumps(esdoc)))
    url = elasticsearch_url + '/drastic/' + mytype
    try:
        r = requests.post(url, data=json.dumps(esdoc))
        if r.status_code != requests.codes.created:
            logger.error('ES status: {0} \n{1}'.format(r.status_code, r.text))
    except IOError as e:
        # BUG FIX: retry() was called without raise, unlike every other task
        # in this module; raise so Celery registers the retry exception.
        raise self.retry(exc=e)
def mkdir(self, path):
    """Creates a container at `path` in Drastic; raises IOError on failure."""
    result = get_client().mkdir(path)
    if not result.ok():
        raise IOError(str(result))
def pollForExtract(self, path, fileid, retries):
    """Poll the feature extraction service for the results of an extraction.
    Re-enqueue this task if still waiting.

    :param path: Drastic path of the object the extraction belongs to
    :param fileid: Clowder file id of the uploaded file
    :param retries: retry budget -- NOTE(review): not referenced in this
        body; confirm whether the task decorator consumes it
    """
    # Ask Clowder for the extraction status of this file.
    url = '{0}/api/extractions/{1}/status?commkey={2}'.format(
        clowder_url, fileid, clowder_commkey)
    parsed = None
    try:
        r = requests.get(url)
        r.raise_for_status()
        parsed = r.json()
    except IOError as e:
        raise self.retry(exc=e)
    extractionStatus = parsed['Status']
    # Status strings returned by the Clowder extractions API.
    doneStatus = ['Done']
    failStatus = ['No Extractor Available. Request is not queued.']
    waitStatus = [
        'Processing', 'Required Extractor is either busy or' +
        ' is not currently running. Try after some time.'
    ]
    if extractionStatus in waitStatus:
        # Extraction still running; retry this task later.
        raise self.retry()
    elif extractionStatus in failStatus:
        # Permanent failure: log and give up on this object.
        msg = 'Extract failed for {0} {1} with {2}'.format(
            path, fileid, extractionStatus)
        logger.warn(msg)
        return
    elif extractionStatus not in doneStatus:
        logger.error(
            'Unrecognized extraction status for {0} {1} with {2}'.format(
                path, fileid, extractionStatus))
        return
    try:
        # Get existing metadata in Drastic
        res = get_client().get_cdmi(str(path))
        if res.code() in [404, 403]:
            logger.warn(
                "Dropping task for object that gives a 403/403: {0}".format(
                    path))
            return
        if not res.ok():
            raise IOError("Drastic get_cdmi failed: {0}".format(res.msg()))
        cdmi_info = res.json()
        metadata = cdmi_info['metadata']
    except IOError as e:
        raise self.retry(exc=e)
    try:
        # GET new metadata
        url = '{0}/api/files/{1}/metadata.jsonld?commkey={2}'.format(
            clowder_url, fileid, clowder_commkey)
        r = requests.get(url)
        r.raise_for_status()
        parsed = r.json()
        logger.debug("fetched metadata: {0}".format(json.dumps(parsed)))
    except IOError as e:
        raise self.retry(exc=e)
    # GET new tags
    try:
        url2 = '{0}/api/files/{1}/tags?commkey={2}'.format(
            clowder_url, fileid, clowder_commkey)
        r2 = requests.get(url2)
        r2.raise_for_status()
        tags = r2.json()['tags']
        if len(tags) > 0:
            metadata['dts_tags'] = tags
        logger.debug("fetched tags: {0}".format(json.dumps(tags)))
        # Modify existing metadata: add the Clowder id and link fields and
        # the freshly fetched extraction metadata, then write back.
        metadata['dts_clowder_link'] = '{0}/files/{1}/'.format(
            clowder_url, fileid)
        metadata['dts_clowder_id'] = fileid
        metadata['dts_metadata'] = parsed
        r = get_client().put(path, metadata=metadata)
        if not r.ok():
            raise IOError(str(r))
    except IOError as e:
        raise self.retry(exc=e)