def main(argv=None): parser = argparse.ArgumentParser( description='Print count of objects for a given collection.') parser.add_argument('path', help="Nuxeo path to collection") parser.add_argument( 'since_date', help= "Script will list docs updated since midnight on this date, GMT. Format YYYY-MM-DD", type=valid_date) parser.add_argument('--pynuxrc', default='~/.pynuxrc', help="rcfile for use with pynux utils") parser.add_argument('--components', action='store_true', help="show counts for object components") if argv is None: argv = parser.parse_args() dh = DeepHarvestNuxeo(argv.path, '', pynuxrc=argv.pynuxrc) print "about to fetch docs for path {}".format(dh.path) objects = dh.fetch_objects() component_count = 0 for obj in objects: last_mod_str = obj['lastModified'][:10] last_mod_date = parse(last_mod_str) if last_mod_date > argv.since_date: print last_mod_str, obj['path'] '''
def main(argv=None): parser = argparse.ArgumentParser( description='Print count of objects for a given collection.') parser.add_argument('path', help="Nuxeo path to collection") parser.add_argument('--pynuxrc', default='~/.pynuxrc-prod', help="rcfile for use with pynux utils") parser.add_argument('--components', action='store_true', help="show counts for object components") if argv is None: argv = parser.parse_args() dh = DeepHarvestNuxeo(argv.path, '', pynuxrc=argv.pynuxrc) print "about to fetch objects for path {}".format(dh.path) objects = dh.fetch_objects() object_count = len(objects) print "finished fetching objects. {} found".format(object_count) if not argv.components: return print "about to iterate through objects and get components" component_count = 0 for obj in objects: components = dh.fetch_components(obj) component_count = component_count + len(components) print "finished fetching components. {} found".format(component_count) print "Grand Total: {}".format(object_count + component_count)
def main(argv=None): parser = argparse.ArgumentParser( description='print differences between Nuxeo and CouchDB for a ' 'given collection' ) parser.add_argument('regid', help="Collection Registry ID") parser.add_argument( '--pynuxrc', default='~/.pynuxrc-basic', help="rcfile for use with pynux utils") if argv is None: argv = parser.parse_args() registry_id = argv.regid couch = get_couch_objects(registry_id) print('couch has {} objects'.format(len(couch))) nxpath = s3stash.s3tools.get_nuxeo_path(registry_id) if nxpath is None: print "No record found for registry_id: {}".format(registry_id) sys.exit() dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc) print "about to fetch objects for path {}".format(dh.path) for obj in dh.fetch_objects(): incouch = True if obj['uid'] in couch else False if not incouch: print(obj['uid'])
def deepharvest(self, metadata): ''' given a set of nuxeo metadata for a doc, deep harvest it ''' self.logger.info("Processing {}".format(metadata['uid'])) dh = DeepHarvestNuxeo('') type = dh.get_calisphere_object_type(metadata['type']) self.logger.info("Type: {}".format(type)) report = {} if type == 'image': ''' stash image ''' nxstash = NuxeoStashImage(metadata['path'], IMAGE_BUCKET, IMAGE_REGION, self.pynuxrc, self.replace, metadata=metadata) report[nxstash.uid] = nxstash.nxstashref() print report if type in ['file', 'audio', 'video']: # stash file nxstash = NuxeoStashFile(metadata['path'], FILE_BUCKET, FILE_REGION, self.pynuxrc, self.replace, metadata=metadata) report[nxstash.uid] = nxstash.nxstashref() # stash thumbnail nxstash = NuxeoStashThumb(metadata['path'], THUMB_BUCKET, THUMB_REGION, self.pynuxrc, self.replace, metadata=metadata) report[nxstash.uid] = nxstash.nxstashref() print report # stash media.json '''
def main(argv=None): parser = argparse.ArgumentParser(description='Create ATOM feed for a given Nuxeo folder for Merritt harvesting') parser.add_argument("collection", help="UCLDC Registry Collection ID") parser.add_argument("--pynuxrc", help="rc file for use by pynux") if argv is None: argv = parser.parse_args() collection_id = argv.collection if argv.pynuxrc: ma = MerrittAtom(collection_id, argv.pynuxrc) else: ma = MerrittAtom(collection_id) print "atom_file: {}".format(ma.atom_file) if argv.pynuxrc: dh = DeepHarvestNuxeo(ma.path, '', pynuxrc=argv.pynuxrc) else: dh = DeepHarvestNuxeo(ma.path, '') print "Fetching Nuxeo docs. This could take a while if collection is large..." documents = dh.fetch_objects() # TODO: fetch components also # create root root = etree.Element(etree.QName(ATOM_NS, "feed"), nsmap=NS_MAP) # add entries for document in documents: nxid = document['uid'] print "constructing entry for {} {}".format(nxid, document['path']) nx_metadata = ma._extract_nx_metadata(nxid) entry = etree.Element(etree.QName(ATOM_NS, "entry")) entry = ma._populate_entry(entry, nx_metadata, nxid) root.insert(0, entry) # add header info print "Adding header info to xml tree" ma._add_merritt_id(root, ma.merritt_id) ma._add_paging_info(root) ma._add_collection_alt_link(root, ma.path) ma._add_atom_elements(root) ma._add_feed_updated(root, ma.last_update) ma._publish_feed(root)
def main(argv=None): ''' create and stash media.json files for a nuxeo collection ''' parser = argparse.ArgumentParser(description='Create and stash media.json' 'files for a nuxeo collection') parser.add_argument("path", help="Nuxeo document path") parser.add_argument( "--bucket", default="static.ucldc.cdlib.org/media_json", help="S3 bucket where media.json files will be stashed") parser.add_argument('--region', default='us-east-1', help="aws region") parser.add_argument("--pynuxrc", default='~/.pynuxrc', help="rc file for use by pynux") parser.add_argument('--replace', action="store_true", help="replace file on s3 if it already exists") if argv is None: argv = parser.parse_args() collection = argv.path.split('/')[-1] # logging logfile = 'logs/mediajson-{}.log'.format(collection) print "LOG:\t{}".format(logfile) logging.basicConfig( filename=logfile, level=logging.INFO, format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p') logger = logging.getLogger(__name__) dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc) report = {} objects = dh.fetch_objects() for obj in objects: nxstash = NuxeoStashMediaJson(obj['path'], argv.bucket, argv.region, argv.pynuxrc, argv.replace) report[nxstash.uid] = nxstash.nxstashref() # output report to json file reportfile = "reports/mediajson-{}.json".format(collection) with open(reportfile, 'w') as f: json.dump(report, f, sort_keys=True, indent=4) # parse report to give basic stats report = json.load(open(reportfile)) print "REPORT:\t{}".format(reportfile) print "SUMMARY:" print "processed:\t{}".format(len(report))
def main(argv=None): parser = argparse.ArgumentParser( description='list objects for a given collection.') parser.add_argument('registry_id', help='UCLDC Registry ID') parser.add_argument('--pynuxrc', default='~/.pynuxrc-basic', help="rcfile for use with pynux utils") if argv is None: argv = parser.parse_args() registry_id = argv.registry_id # get nuxeo path nxpath = s3stash.s3tools.get_nuxeo_path(registry_id) if nxpath is None: print "No record found for registry_id: {}".format(registry_id) sys.exit() dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc) print "about to fetch objects for path {}".format(dh.path) objects = dh.fetch_objects() object_count = len(objects) print "finished fetching objects. {} found".format(object_count) print "about to iterate through objects and get components" component_count = 0 all_components = [] for obj in objects: components = dh.fetch_components(obj) all_components.extend(components) print "{} components for {}".format(len(components), obj['uid']) print "finished fetching components. {} found".format(len(all_components)) objects.extend(all_components) total_obj = len(objects) print "Grand Total: {}".format(total_obj) # https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks chunks = [ objects[i:i + PER_PAGE] for i in xrange(0, len(objects), PER_PAGE) ] count = 0 for c in chunks: count = count + 1 filepath = 'chunks/{}_{}.txt'.format(registry_id, count) print "Writing file: {}".format(filepath) with open(filepath, 'w') as f: json.dump(c, f, indent=4)
def __init__(self, path, pynuxrc, replace=False, loglevel=_loglevel_):
    # Set up a deep-harvester for `path` and pre-fetch every object plus
    # its components.
    self.logger = logging.getLogger(__name__)
    self.path = path
    self.pynuxrc = pynuxrc
    self.replace = replace
    self.dh = DeepHarvestNuxeo(self.path, '', pynuxrc=self.pynuxrc)
    # NOTE(review): objects come from self.fetch_objects() (a method on this
    # class) rather than self.dh.fetch_objects() — confirm that is intended.
    self.objects = self.fetch_objects()
    # uid -> list of component docs for that object
    self.components = {
        obj['uid']: self.dh.fetch_components(obj)
        for obj in self.objects
    }
def __init__(self,
             path,
             bucket,
             region,
             pynuxrc='~/.pynuxrc',
             replace=True,
             **kwargs):
    # Extend the base stasher with a deep-harvester, a MediaJson builder,
    # and the uid-derived target file location.
    super(NuxeoStashMediaJson, self).__init__(path, bucket, region, pynuxrc,
                                              replace, **kwargs)

    self.dh = DeepHarvestNuxeo(
        self.path, self.bucket, pynuxrc=self.pynuxrc)
    self.mj = MediaJson()

    self.filename = FILENAME_FORMAT.format(self.uid)
    self.filepath = os.path.join(self.tmp_dir, self.filename)
    # record the computed file location in the run report
    for attr in ('filename', 'filepath'):
        self._update_report(attr, getattr(self, attr))
def main(collection_ids,
         rq_queue='dh-q',
         config=None,
         pynuxrc=None,
         replace=False,
         timeout=JOB_TIMEOUT,
         log_handler=None):
    ''' Queue a deep harvest of a nuxeo object on a worker'''
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()
    log = logbook.Logger('QDH')

    def enqueue(doc_path):
        # one redis-queued deep-harvest job per document path
        queue_deep_harvest_path(
            config['redis_host'],
            config['redis_port'],
            config['redis_password'],
            config['redis_connect_timeout'],
            rq_queue=rq_queue,
            path=doc_path,
            replace=replace,
            timeout=timeout)

    for cid in collection_ids.split(';'):
        url_api = ''.join(
            ('https://registry.cdlib.org/api/v1/collection/', cid, '/'))
        coll = Collection(url_api)
        dh = DeepHarvestNuxeo(coll.harvest_extra_data, '', pynuxrc=pynuxrc)
        for obj in dh.fetch_objects():
            log.info('Queueing TOPLEVEL {} :-: {}'.format(
                obj['uid'], obj['path']))
            # deep harvest top level object
            enqueue(obj['path'])
            # deep harvest component sub-objects
            for c in dh.fetch_components(obj):
                log.info('Queueing {} :-: {}'.format(c['uid'], c['path']))
                enqueue(c['path'])

    log_handler.pop_application()
def __init__(self,
             path,
             bucket,
             region,
             pynuxrc='~/.pynuxrc',
             replace=False,
             **kwargs):
    # Initialize a stash-ref: resolve metadata for `path` (from kwargs if
    # supplied, otherwise via the Nuxeo API), derive the calisphere type,
    # and seed the run report.
    self.logger = logging.getLogger(__name__)

    self.path = path
    self.bucket = bucket
    self.pynuxrc = pynuxrc
    self.region = region
    self.replace = replace

    self.nx = utils.Nuxeo(rcfile=open(expanduser(self.pynuxrc), 'r'))

    # prefer caller-supplied metadata; fall back to a pynux lookup
    if 'metadata' in kwargs:
        self.metadata = kwargs['metadata']
        self.logger.info("got metadata from kwargs")
    else:
        self.metadata = self.nx.get_metadata(path=self.path)
        self.logger.info("got metadata via pynux utils")

    self.uid = self.metadata['uid']
    self.logger.info("initialized NuxeoStashRef with path {}".format(
        self.path.encode('ascii', 'replace')))

    self.dh = DeepHarvestNuxeo(self.path, uid=self.uid)
    self.calisphere_type = self.dh.get_calisphere_object_type(
        self.metadata['type'])

    self.tmp_dir = tempfile.mkdtemp(dir='/tmp')  # FIXME put in conf

    # seed the report with the identifying fields, in a fixed order
    self.report = {}
    for field in ('uid', 'path', 'bucket', 'replace', 'pynuxrc',
                  'calisphere_type'):
        self._update_report(field, getattr(self, field))
def main(argv=None): parser = argparse.ArgumentParser( description= 'Print info on objects missing from couchdb for Nuxeo collection') parser.add_argument('id', help='Collection registry ID') parser.add_argument('--pynuxrc', default='~/.pynuxrc', help="rcfile for use with pynux utils") if argv is None: argv = parser.parse_args() registry_id = argv.id print "Registry ID: {}".format(registry_id) nxpath = s3stash.s3tools.get_nuxeo_path(registry_id) print "Nuxeo path: {}".format(nxpath) # get couchdb data view = "https://harvest-stg.cdlib.org/couchdb/ucldc/_design/all_provider_docs/_view/by_provider_name?key=%22{}%22".format( registry_id) print view res = requests.get(view, verify=False) # FIXME we want to verify res.raise_for_status() couchdata = json.loads(res.content) rows = couchdata['rows'] delimiter = "{}--".format(registry_id) couch_uids = [row['id'].split(delimiter)[1] for row in rows] couch_count = len(couch_uids) print "Total rows in couchdb: {}".format(couch_count) # get nuxeo data dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc) print "about to fetch objects for path {}".format(dh.path) objects = dh.fetch_objects() nx_count = len(objects) print "Total objects in Nuxeo: {}".format(nx_count) for obj in objects: if obj['uid'] not in couch_uids: print obj['uid'], obj['path']
def main(argv=None): parser = argparse.ArgumentParser( description= 'list objects for a given collection where nuxeo doc type is image but file type is pdf' ) parser.add_argument('registry_id', help='UCLDC Registry ID') parser.add_argument('--pynuxrc', default='~/.pynuxrc-basic', help="rcfile for use with pynux utils") if argv is None: argv = parser.parse_args() registry_id = argv.registry_id # get nuxeo path nxpath = s3stash.s3tools.get_nuxeo_path(registry_id) if nxpath is None: print "No record found for registry_id: {}".format(registry_id) sys.exit() dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc) print "about to fetch objects for path {}".format(dh.path) objects = dh.fetch_objects() object_count = len(objects) print "finished fetching objects. {} found".format(object_count) convert = Convert() counter = 0 for obj in objects: if dh.has_file(obj) and obj['type'] == u'SampleCustomPicture' and obj[ 'properties']['file:content'][ 'mime-type'] == u'application/pdf': print obj['uid'], obj['path'], obj['type'], obj['properties'][ 'file:content']['name'] counter = counter + 1 print counter
def __init__(self, url_harvest, extra_data, conf_pynux=None, **kwargs):
    ''' uses pynux (https://github.com/ucldc/pynux) to grab objects from
    the Nuxeo API

    api url is set from url_harvest, overriding pynuxrc config and
    passed in conf. the pynux config file should have user & password
    and X-NXDocumentProperties values filled in.
    '''
    super(NuxeoFetcher, self).__init__(url_harvest, extra_data, **kwargs)
    # FIX: the original default was the mutable `conf_pynux={}`, and this
    # method mutates it (conf_pynux['api'] = ...) — so the shared default
    # dict leaked state across instances. Default to None instead.
    if conf_pynux is None:
        conf_pynux = {}
    self._url = url_harvest
    self._path = extra_data
    self._nx = pynux.utils.Nuxeo(conf=conf_pynux)
    self._nx.conf['api'] = self._url
    self._structmap_bucket = STRUCTMAP_S3_BUCKET

    # get harvestable child objects
    conf_pynux['api'] = self._url
    self._dh = DeepHarvestNuxeo(self._path, '', conf_pynux=conf_pynux)
    self._children = iter(self._dh.fetch_objects())
def main(argv=None): ''' stash Nuxeo files of type 'file', 'audio', or 'video' for a collection ''' parser = argparse.ArgumentParser( description='For Nuxeo collection, stash files (pdf, txt, etc) in S3.') parser.add_argument('path', help="Nuxeo document path to collection") parser.add_argument( '--bucket', default='ucldc-nuxeo-ref-media', help="S3 bucket name") parser.add_argument('--region', default='us-west-2', help="aws region") parser.add_argument( '--pynuxrc', default='~/.pynuxrc', help="rc file for use by pynux") parser.add_argument( '--replace', action="store_true", help="replace file on s3 if it already exists") if argv is None: argv = parser.parse_args() collection = argv.path.split('/')[-1] # logging logfile = 'logs/{}.log'.format(collection) print "LOG:\t{}".format(logfile) logging.basicConfig( filename=logfile, level=logging.INFO, format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p') logger = logging.getLogger(__name__) dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc) report = {} objects = dh.fetch_objects() for obj in objects: nxstash = NuxeoStashFile(obj['path'], argv.bucket, argv.region, argv.pynuxrc, argv.replace) report[nxstash.uid] = nxstash.nxstashref() for c in dh.fetch_components(obj): nxstash = NuxeoStashFile(c['path'], argv.bucket, argv.region, argv.pynuxrc, argv.replace) report[nxstash.uid] = nxstash.nxstashref() # output report to json file reportfile = "reports/{}.json".format(collection) with open(reportfile, 'w') as f: json.dump(report, f, sort_keys=True, indent=4) # parse report to give basic stats report = json.load(open(reportfile)) print "REPORT:\t{}".format(reportfile) print "SUMMARY:" print "processed:\t{}".format(len(report)) not_file = len([ key for key, value in report.iteritems() if not value['calisphere_type'] in VALID_CALISPHERE_TYPES ]) print "not type `file`, `audio` or `video`:\t{}".format(not_file) already_stashed = len([ key for key, value in report.iteritems() 
if 'already_s3_stashed' in value.keys() and value['already_s3_stashed'] ]) print "already stashed:\t{}".format(already_stashed) stashed = len( [key for key, value in report.iteritems() if value['stashed']]) print "(re)stashed:\t{}".format(stashed) print "\nDone."
def main(argv=None): parser = argparse.ArgumentParser( description= 'Create ATOM feed for a given Nuxeo folder for Merritt harvesting') parser.add_argument("collection", help="UCLDC Registry Collection ID") parser.add_argument("--pynuxrc", help="rc file for use by pynux") if argv is None: argv = parser.parse_args() collection_id = argv.collection if argv.pynuxrc: ma = MerrittAtom(collection_id, argv.pynuxrc) else: ma = MerrittAtom(collection_id) print "atom_file: {}".format(ma.atom_file) print "ma.path: {}".format(ma.path) if argv.pynuxrc: dh = DeepHarvestNuxeo(ma.path, '', pynuxrc=argv.pynuxrc) else: dh = DeepHarvestNuxeo(ma.path, '') print "Nuxeo path: {}".format(ma.path) print "Fetching Nuxeo docs. This could take a while if collection is large..." documents = dh.fetch_objects() # create root root = etree.Element(etree.QName(ATOM_NS, "feed"), nsmap=NS_MAP) # add entries for document in documents: nxid = document['uid'] print "working on document: {} {}".format(nxid, document['path']) # parent entry = ma._construct_entry(nxid, True) print "inserting entry for parent object {} {}".format( nxid, document['path']) root.insert(0, entry) # children component_entries = [ ma._construct_entry(c['uid'], False) for c in dh.fetch_components(document) ] for ce in component_entries: print "inserting entry for component: {} {}".format( nxid, document['path']) root.insert(0, ce) # add header info print "Adding header info to xml tree" ma._add_merritt_id(root, ma.merritt_id) ma._add_paging_info(root) ma._add_collection_alt_link(root, ma.path) ma._add_atom_elements(root) ma._add_feed_updated(root, ma.last_update) ma._write_feed(root) print "Feed written to file: {}".format(ma.atom_file) ma._s3_stash() print "Feed stashed on s3: {}".format(ma.s3_url)
def main(argv=None): ''' stash Nuxeo image files on s3 ''' parser = argparse.ArgumentParser( description='For Nuxeo collection, create jp2 versions of image ' 'files and stash in S3.') parser.add_argument('path', help="Nuxeo document path to collection") parser.add_argument('--bucket', default='ucldc-private-files/jp2000', help="S3 bucket name") parser.add_argument('--region', default='us-west-2', help='AWS region') parser.add_argument('--replace', action="store_true", help="replace file on s3 if it already exists") parser.add_argument('--pynuxrc', default='~/.pynuxrc', help="rc file for use by pynux") if argv is None: argv = parser.parse_args() collection = argv.path.split('/')[-1] # logging logfile = 'logs/{}.log'.format(collection) print "LOG:\t{}".format(logfile) logging.basicConfig( filename=logfile, level=logging.INFO, format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p') logger = logging.getLogger(__name__) dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc) report = {} objects = dh.fetch_objects() for obj in objects: nxstash = NuxeoStashImage(obj['path'], argv.bucket, argv.region, argv.pynuxrc, argv.replace) report[nxstash.uid] = nxstash.nxstashref() for c in dh.fetch_components(obj): nxstash = NuxeoStashImage(c['path'], argv.bucket, argv.region, argv.pynuxrc, argv.replace) report[nxstash.uid] = nxstash.nxstashref() # output report to json file reportfile = "reports/{}.json".format(collection) with open(reportfile, 'w') as f: json.dump(report, f, sort_keys=True, indent=4) # parse report to give basic stats report = json.load(open(reportfile)) print "REPORT:\t{}".format(reportfile) print "SUMMARY:" print "processed:\t{}".format(len(report)) not_image = len([ key for key, value in report.iteritems() if not value['is_image']['is_image'] ]) print "not image:\t{}".format(not_image) unrecognized = len([ key for key, value in report.iteritems() if not value['precheck']['pass'] ]) print "not 
convertible:\t{}".format(unrecognized) converted = len( [key for key, value in report.iteritems() if value['converted']]) already_stashed = len([ key for key, value in report.iteritems() if 'already_s3_stashed' in value.keys() and value['already_s3_stashed'] ]) print "converted:\t{}".format(converted) stashed = len( [key for key, value in report.iteritems() if value['stashed']]) print "stashed:\t{}".format(stashed) print "\nDone."