def __init__(self, collection_id, pynuxrc=''):
    if pynuxrc:
        self.nx = utils.Nuxeo(rcfile=open(pynuxrc, 'r'))
    elif not pynuxrc and os.path.isfile(expanduser('~/.pynuxrc')):
        self.nx = utils.Nuxeo(rcfile=open(expanduser('~/.pynuxrc'), 'r'))
    self.collection_id = collection_id
    self.path = self._get_nuxeo_path()
    self.merritt_id = self.get_merritt_id(self.path)
    self.atom_file = self._get_filename(self.collection_id)
    if not self.atom_file:
        raise ValueError(
            "Could not create filename for ATOM feed based on collection id: {}".format(
                self.collection_id))
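# Illustrative usage sketch (not from the original source): the class name
# AtomFeedBuilder and the collection id '466' are assumptions; only the
# attributes read below are actually set by the constructor above.
feed = AtomFeedBuilder('466')   # falls back to ~/.pynuxrc when no pynuxrc is given
print(feed.path)        # Nuxeo path resolved from the collection id
print(feed.merritt_id)  # Merritt id looked up for that path
print(feed.atom_file)   # filename the ATOM feed will be written to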
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='check for existence of jp2 file on s3 for given nuxeo path')
    parser.add_argument('path', help="Nuxeo document path")
    parser.add_argument('bucket', help="S3 bucket name")
    parser.add_argument('--pynuxrc', default='~/.pynux-prod',
                        help="rc file for use by pynux")
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nuxeo_path = argv.path
    bucketpath = argv.bucket

    nx = utils.Nuxeo(rcfile=argv.pynuxrc, loglevel=argv.loglevel.upper())

    # just for simple objects for now
    objects = nx.children(argv.path)

    print "\nFound objects at {}.\nChecking S3 bucket {} for existence of corresponding files.\nThis could take a while...".format(nuxeo_path, bucketpath)

    i = 0
    for obj in objects:
        nuxeo_id = nx.get_uid(obj['path'])
        check_object_on_s3(nuxeo_id, bucketpath)
        i = i + 1

    print "Done. Checked {} objects".format(i)
def google_object(filepath, url):
    import gspread
    from oauth2client.service_account import ServiceAccountCredentials

    obj = object_level(filepath)
    nx = utils.Nuxeo()
    scope = [
        'https://spreadsheets.google.com/feeds',
        'https://www.googleapis.com/auth/drive'
    ]
    creds = ServiceAccountCredentials.from_json_keyfile_name(
        'client_secret.json', scope)
    client = gspread.authorize(creds)

    # write the object-level rows to a temporary CSV, then import it into the sheet
    with open("temp.csv", "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=obj['fieldnames'])
        writer.writeheader()
        for row in obj['data']:
            writer.writerow(row)
    with open("temp.csv", encoding="utf8") as f:
        s = f.read() + '\n'

    sheet_id = client.open_by_url(url).id
    client.import_csv(sheet_id, s)
    client.open_by_key(sheet_id).sheet1.update_title(
        "nuxeo_object_%s" % nx.get_metadata(path=filepath)['properties']['dc:title'])
    os.remove("temp.csv")
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='download the main file from Nuxeo for a given document path')
    parser.add_argument('path', help="Nuxeo document path")
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nuxeo_path = argv.path
    print "\nnuxeo_path:", nuxeo_path

    # get the Nuxeo ID
    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    nuxeo_id = nx.get_uid(nuxeo_path)
    print "nuxeo_id:", nuxeo_id

    download_url = get_download_url(nuxeo_id, nuxeo_path, nx)
    print download_url, '\n'

    filename = os.path.basename(nuxeo_path)
    filepath = os.path.join(os.getcwd(), filename)
    download_nuxeo_file(download_url, filepath, nx)

    print "\nDone\n"
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='find duplicate blobs (by digest) via Nuxeo REST API')
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    documents = nx.nxql(
        'select * from Document where ecm:path startswith "/asset-library/UCM"'
    )
    duplicates = defaultdict(list)
    row = 0
    for document in documents:
        for blob in blob_from_doc(document):
            if blob:
                duplicates[blob['digest']].append(
                    (blob['uid'],
                     u'{0}#{1}'.format(blob['path'], blob['xpath']).encode('utf-8')))
        if row % 25000 == 0:
            print '{0} blobs checked'.format(row)
        row = row + 1

    # keep only digests that appear more than once
    # http://stackoverflow.com/a/8425075
    duplicates = {k: v for k, v in duplicates.items() if len(v) > 1}
    pp(duplicates)
    print(len(duplicates))
def object_level(filepath):
    nx = utils.Nuxeo()
    data = []
    for n in nx.children(filepath):
        data2 = {}
        get_title(data2, n)
        get_filepath(data2, n)
        get_type(data2, n, all_headers)
        get_alt_title(data2, n, all_headers)
        get_identifier(data2, n, all_headers)
        get_local_identifier(data2, n, all_headers)
        get_campus_unit(data2, n, all_headers)
        get_date(data2, n, all_headers)
        get_publication(data2, n, all_headers)
        get_creator(data2, n, all_headers)
        get_contributor(data2, n, all_headers)
        get_format(data2, n, all_headers)
        get_description(data2, n, all_headers)
        get_extent(data2, n, all_headers)
        get_language(data2, n, all_headers)
        get_temporal_coverage(data2, n, all_headers)
        get_transcription(data2, n, all_headers)
        get_access_restrictions(data2, n, all_headers)
        get_rights_statement(data2, n, all_headers)
        get_rights_status(data2, n, all_headers)
        get_copyright_holder(data2, n, all_headers)
        get_copyright_info(data2, n, all_headers)
        get_collection(data2, n, all_headers)
        get_related_resource(data2, n, all_headers)
        get_source(data2, n, all_headers)
        get_subject_name(data2, n, all_headers)
        get_place(data2, n, all_headers)
        get_subject_topic(data2, n, all_headers)
        get_form_genre(data2, n, all_headers)
        get_provenance(data2, n, all_headers)
        get_physical_location(data2, n, all_headers)
        data.append(data2)

    # ensures that File path, Title and Type are the first three columns
    fieldnames = ['File path', 'Title', 'Type']
    for data2 in data:
        for key, value in data2.items():
            if key not in fieldnames:
                fieldnames.append(key)

    return {
        'fieldnames': fieldnames,
        'data': data,
        'filename': "nuxeo_object_%s.tsv" %
                    nx.get_metadata(path=filepath)['properties']['dc:title']
    }
def get_existing_data(self, filepath, metadata_path):
    if self.blankout:
        return []
    if self.data is None:
        nx = utils.Nuxeo()
        self.data = nx.get_metadata(path=filepath)
    return self.data['properties']['ucldc_schema:{}'.format(metadata_path)]
def main(argv=None):
    parser = argparse.ArgumentParser(description='convert an object to jp2')
    parser.add_argument('path', help="Nuxeo document path")
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    print argv.path
    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--datafile",
        type=str,
        required=True,
        help="tab-delimited spreadsheet input file -- required")
    parser.add_argument('-d', '--dry-run', action='store_true', help='dry run')
    parser.add_argument('--blankout', action='store_true',
                        help='blank out all fields not set in sheet')
    utils.get_common_options(parser)
    args = parser.parse_args()

    try:
        assert os.path.isfile(args.datafile)
    except AssertionError:
        print("Not a file: ", args.datafile)
        sys.exit(2)

    csv_data_file = args.datafile
    print(csv_data_file)
    print(args.rcfile)
    print(args.loglevel)

    nx = utils.Nuxeo(rcfile=args.rcfile, loglevel=args.loglevel.upper())
    nuxeo_limit = 24

    # get an instance of the Csv2Dict class, which must be initialized
    # with the name of an input data (csv) file
    csv2dict = Csv2Dict(csv_data_file, blankout=args.blankout)
    if csv2dict.status != 0:
        print('The Csv2Dict constructor reported an error (%d).' % csv2dict.status)
        sys.exit(csv2dict.status)

    process_rows(csv2dict)

    for n in range(csv2dict.get_meta_dict_length()):
        print("Loading payload %d" % n)
        payload = csv2dict.get_meta_dict(n)
        print(payload)
        print(payload['path'])
        if not args.dry_run:
            uid = nx.get_uid(payload['path'])
            print("Returned UID: %d) %s." % (n, uid))
            nx.update_nuxeo_properties(payload, path=payload['path'])
def __init__(self, path, s3_bucket_mediajson='', **kwargs):
    # get configuration and initialize pynux.utils.Nuxeo
    self.nx = None
    if 'pynuxrc' in kwargs:
        pynuxrc = kwargs['pynuxrc']
        self.nx = utils.Nuxeo(rcfile=open(expanduser(pynuxrc), 'r'))
    elif 'conf_pynux' in kwargs:
        conf_pynux = kwargs['conf_pynux']
        self.nx = utils.Nuxeo(conf=conf_pynux)
    else:
        self.nx = utils.Nuxeo(conf={})

    self.path = path
    if 'uid' in kwargs:
        self.uid = kwargs['uid']
    else:
        self.uid = self.nx.get_uid(self.path)
    self.s3_bucket_mediajson = s3_bucket_mediajson
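# Illustrative usage sketch: the class name DeepHarvestNuxeo is assumed from the
# stash constructor further below, which passes the same arguments; the paths,
# rcfile, and uid values here are placeholders, not real data.
dh = DeepHarvestNuxeo('/asset-library/UCX/example_collection/object',
                      pynuxrc='~/.pynuxrc')                   # rcfile route
dh = DeepHarvestNuxeo('/asset-library/UCX/example_collection/object',
                      conf_pynux={'api': 'http://mockme/r'})  # explicit conf route
dh = DeepHarvestNuxeo('/asset-library/UCX/example_collection/object',
                      uid='a-known-uid')                      # skip the get_uid lookup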
def main(argv=None):
    parser = argparse.ArgumentParser(description='nuxeo metadata via REST API')
    parser.add_argument('path', nargs=1, help='nuxeo document path', type=utf8_arg)
    parser.add_argument(
        '--outdir',
        help="directory to hold application/json+nxentity .json files",
        type=utf8_arg)

    rstyle = parser.add_mutually_exclusive_group(required=False)
    rstyle.add_argument('--recursive-folders',
                        help='recursively list project folders/Organization',
                        action='store_true')
    rstyle.add_argument('--recursive-objects',
                        help='recursively list objects',
                        action='store_true')

    show = parser.add_mutually_exclusive_group(required=False)
    show.add_argument('--show-only-uid', action='store_true')
    show.add_argument('--show-only-path', action='store_true')
    show.add_argument('--show-custom-function')

    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())

    if argv.recursive_folders:
        documents = nx.recursive_project_folders(argv.path[0])
    elif argv.recursive_objects:
        documents = nx.recursive_objects(argv.path[0])
    else:
        documents = itertools.chain(
            nx.nxql(u'select * from Document where ecm:path="{}"'.format(
                argv.path[0])),
            nx.children(argv.path[0]))

    if argv.outdir:
        # Expand user- and relative-paths
        outdir = os.path.abspath(os.path.expanduser(argv.outdir))
        nx.copy_metadata_to_local(documents, outdir)
    elif argv.show_only_path:
        for document in documents:
            print(document['path'])
    elif argv.show_only_uid:
        for document in documents:
            print(document['uid'])
    elif argv.show_custom_function:
        mapper = importlib.import_module(argv.show_custom_function)
        mapper.nuxeo_mapper(documents, nx)
    else:
        nx.print_document_summary(documents)
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='Print nuxeo json metadata for object.')
    parser.add_argument('path', help="Nuxeo document path")
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    path = argv.path
    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    uid = nx.get_uid(path)
    metadata = nx.get_metadata(uid=uid)
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='Print nuxeo path for given uid.')
    parser.add_argument('uid', help="Nuxeo uid")
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    uid = argv.uid
    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    metadata = nx.get_metadata(uid=uid)
    path = metadata['path']
    print path, uid
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='print info for items in collection where media.json '
                    'file is missing.')
    parser.add_argument('path', help="Nuxeo document path for collection")
    parser.add_argument('bucket', help="S3 bucket name")
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nuxeo_path = argv.path
    bucketpath = argv.bucket
    print "collection nuxeo_path:", nuxeo_path

    # get the Nuxeo ID for the collection
    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    nuxeo_id = nx.get_uid(nuxeo_path)
    print "collection nuxeo_id:", nuxeo_id

    # connect to S3
    conn = connect_s3(calling_format=OrdinaryCallingFormat())
    bucketpath = bucketpath.strip("/")
    bucketbase = bucketpath.split("/")[0]
    print "bucketpath:", bucketpath
    print "bucketbase:", bucketbase
    try:
        bucket = conn.get_bucket(bucketbase)
    except boto.exception.S3ResponseError:
        print "bucket doesn't exist on S3:", bucketbase

    items = nx.children(nuxeo_path)
    for item in items:
        obj_key = "{0}-media.json".format(item['uid'])
        s3_url = "s3://{0}/{1}".format(bucketpath, obj_key)
        #print "s3_url:", s3_url
        parts = urlparse.urlsplit(s3_url)
        #print "obj_key", obj_key
        #print "s3_url", s3_url
        if not bucket.get_key(parts.path):
            print "object doesn't exist on S3:", parts.path
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='get media.json file for given nuxeo path')
    parser.add_argument('path', help="Nuxeo document path")
    parser.add_argument('bucket', help="S3 bucket name")
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nuxeo_path = argv.path
    bucketpath = argv.bucket
    print "nuxeo_path:", nuxeo_path

    # get the Nuxeo ID
    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    nuxeo_id = nx.get_uid(nuxeo_path)
    print "nuxeo_id:", nuxeo_id

    # see if a media.json file exists on S3 for this object
    conn = connect_s3(calling_format=OrdinaryCallingFormat())
    bucketpath = bucketpath.strip("/")
    bucketbase = bucketpath.split("/")[0]
    obj_key = "{0}-media.json".format(nuxeo_id)
    s3_url = "s3://{0}/{1}".format(bucketpath, obj_key)
    print "s3_url:", s3_url
    parts = urlparse.urlsplit(s3_url)
    print "bucketpath:", bucketpath
    print "bucketbase:", bucketbase
    try:
        bucket = conn.get_bucket(bucketbase)
    except boto.exception.S3ResponseError:
        print "bucket doesn't exist on S3:", bucketbase

    if not bucket.get_key(parts.path):
        print "object doesn't exist on S3:", parts.path
    else:
        print "yup the object exists!:", parts.path
        k = Key(bucket)
        k.key = parts.path
        print "\nfile contents:"
        print k.get_contents_as_string()
def __init__(self, path, bucket, region, pynuxrc='~/.pynuxrc',
             replace=False, **kwargs):
    self.logger = logging.getLogger(__name__)

    self.path = path
    self.bucket = bucket
    self.pynuxrc = pynuxrc
    self.region = region
    self.replace = replace

    self.nx = utils.Nuxeo(rcfile=open(expanduser(self.pynuxrc), 'r'))

    if 'metadata' in kwargs:
        self.metadata = kwargs['metadata']
        self.logger.info("got metadata from kwargs")
    else:
        self.metadata = self.nx.get_metadata(path=self.path)
        self.logger.info("got metadata via pynux utils")

    self.uid = self.metadata['uid']
    self.logger.info("initialized NuxeoStashRef with path {}".format(
        self.path.encode('ascii', 'replace')))

    self.dh = DeepHarvestNuxeo(self.path, uid=self.uid)
    self.calisphere_type = self.dh.get_calisphere_object_type(
        self.metadata['type'])
    self.tmp_dir = tempfile.mkdtemp(dir='/tmp')  # FIXME put in conf

    self.report = {}
    self._update_report('uid', self.uid)
    self._update_report('path', self.path)
    self._update_report('bucket', self.bucket)
    self._update_report('replace', self.replace)
    self._update_report('pynuxrc', self.pynuxrc)
    self._update_report('calisphere_type', self.calisphere_type)
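# Illustrative usage sketch: the class name NuxeoStashRef comes from the log
# message in the constructor above; the path, bucket, and region values are
# placeholders only.
stash = NuxeoStashRef('/asset-library/UCX/example_collection/object',
                      'example-bucket', 'us-west-2',
                      pynuxrc='~/.pynuxrc', replace=False)
print(stash.calisphere_type)  # object type as mapped for Calisphere
print(stash.report)           # report dict seeded by the constructor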
def get_metadata(nuxeo_top_path, item_level=False):
    """ authorize nuxeo client, iterate over documents to retrieve and map
        metadata rows; return list of dicts (1 dict = 1 metadata row)
    """
    nx = utils.Nuxeo()
    data = []
    if item_level:
        for doc in nx.children(nuxeo_top_path):
            for item in nx.children(doc["path"]):
                metadata_row = make_metadata_row(item)
                data.append(metadata_row)
    else:  # object level
        for doc in nx.children(nuxeo_top_path):
            metadata_row = make_metadata_row(doc)
            data.append(metadata_row)
    return data
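# Illustrative usage sketch: write the rows returned by get_metadata to a TSV.
# The collection path and output filename are placeholders.
import csv

rows = get_metadata('/asset-library/UCX/example_collection', item_level=True)
fieldnames = []
for r in rows:                      # preserve first-seen column order
    for key in r:
        if key not in fieldnames:
            fieldnames.append(key)
with open('metadata.tsv', 'w') as out:
    writer = csv.DictWriter(out, fieldnames=fieldnames, delimiter='\t')
    writer.writeheader()
    for r in rows:
        writer.writerow(r)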
def main(argv=None):
    parser = argparse.ArgumentParser(description='nuxeo metadata via REST API')
    parser.add_argument(
        '--outdir',
        help="directory to hold application/json+nxentity .json files",
        type=utf8_arg)
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    # e.g.: http://demo.nuxeo.com/nuxeo/api/v1/path/default-domain/@search?query=SELECT%20*%20FROM%20Document&pageSize=2&currentPageIndex=1 | jq .
    # todo: add these defaults as parameters as well as env
    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    documents = nx.all()

    if argv.outdir:
        # Expand user- and relative-paths
        outdir = os.path.abspath(os.path.expanduser(argv.outdir))
        nx.copy_metadata_to_local(documents, outdir)
    else:
        nx.print_document_summary(documents)
def main(argv=None):
    parser = argparse.ArgumentParser(description='nxql via REST API')
    parser.add_argument('nxql', nargs=1, help="nxql query", type=utf8_arg)
    parser.add_argument(
        '--outdir',
        help="directory to hold application/json+nxentity .json files",
        type=utf8_arg)
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    documents = nx.nxql(argv.nxql[0])

    if argv.outdir:
        # Expand user- and relative-paths
        outdir = os.path.abspath(os.path.expanduser(argv.outdir))
        nx.copy_metadata_to_local(documents, outdir)
    else:
        nx.print_document_summary(documents)
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='verify Nuxeo blob digests and sizes against S3 via REST API')
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    # memorize every key in the S3 bucket so sizes can be checked locally
    file_dict = {}
    conn = s3.connect_to_region('us-west-2',
                                calling_format=OrdinaryCallingFormat())
    bucket = conn.get_bucket('data.nuxeo.cdlib.org.oregon')
    for count, key in enumerate(bucket.list()):
        file_dict[key.name] = key.size
        if count % 50000 == 0:
            print('{0} s3 files memorized'.format(count))

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    documents = nx.nxql('select * from Document')
    row = 0
    for document in documents:
        for blob in blob_from_doc(document):
            if blob:
                s3_size = file_dict.get(blob['digest'], None)
                if not s3_size:
                    print('{0} from {1} {2} not found in S3'.format(
                        blob['digest'], blob['path'], blob['xpath']))
                if file_dict.get(blob['digest'], 0) != int(blob['length']):
                    print('{0} from {1} {2} s3 size {3} does not match nuxeo size {4}'
                          .format(blob['digest'], blob['path'], blob['xpath'],
                                  s3_size, blob['length']))
        if row % 25000 == 0:
            print('{0} nuxeo blobs checked'.format(row))
        row = row + 1
def main(argv=None): """main""" parser = argparse.ArgumentParser( description='nuxeo metadata via REST API, one record') parser.add_argument('file', nargs=1, help="application/json+nxentity") group = parser.add_mutually_exclusive_group() group.add_argument('--uid', help="update specific nuxeo uid") group.add_argument( '--path', help="update specific nuxeo path", type=utf8_arg) utils.get_common_options(parser) if argv is None: argv = parser.parse_args() # todo; add these defaults as parameters as well as env nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper()) pp(argv.file[0]) jfile = argv.file[0] uid = argv.uid path = argv.path json_data = open(jfile) data = json.load(json_data) ret = {} if uid: # use uid supplied at command line ret = nx.update_nuxeo_properties(data, uid=uid) elif path: # use path supplied at command line ret = nx.update_nuxeo_properties(data, path=path) # if no uid nor path was specified on the command line, then # prefer "path": to "uid": when importing files because the file may have # come from another machine where the uuids are different else: uid = nx.get_uid(data.get('path')) or data.get('uid') ret = nx.update_nuxeo_properties(data, uid=uid) if not ret: print("no uid found, specify --uid or --path") exit(1) pp(ret)
def setUp(self):
    self.nx = utils.Nuxeo(
        {
            'api': 'http://mockme/r',
        },
        rcfile=io.BytesIO(bytes()))
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='extent stats via Nuxeo REST API')
    parser.add_argument('path', nargs=1, help="root path")
    parser.add_argument('outdir', nargs=1)
    parser.add_argument('--no-s3-check', dest='s3_check', action='store_false')
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    # look up all the files in S3, so we can double check that all
    # the files exist as we loop through Nuxeo
    file_check = None
    s3_bytes = s3_count = 0
    if argv.s3_check:
        from boto import s3
        from boto.s3.connection import OrdinaryCallingFormat
        file_check = {}
        conn = s3.connect_to_region('us-west-2',
                                    calling_format=OrdinaryCallingFormat())
        bucket = conn.get_bucket('data.nuxeo.cdlib.org.oregon')
        for count, key in enumerate(bucket.list()):
            file_check[key.name] = key.size
            if count % 50000 == 0:
                print('{0} s3 files memorized'.format(count), file=sys.stderr)
            s3_bytes = s3_bytes + key.size
        s3_count = len(file_check)

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    campuses = nx.children(argv.path[0])

    summary_workbook = xlsxwriter.Workbook(
        os.path.join(argv.outdir[0], '{}-summary.xlsx'.format(today)))

    # cell formats
    header_format = summary_workbook.add_format({'bold': True})
    number_format = summary_workbook.add_format()
    number_format.set_num_format('#,##0')

    summary_worksheet = summary_workbook.add_worksheet('summary')
    # headers
    summary_worksheet.write(0, 1, 'deduplicated files', header_format)
    summary_worksheet.write(0, 2, 'deduplicated bytes', header_format)
    summary_worksheet.write(0, 4, 'total files', header_format)
    summary_worksheet.write(0, 5, 'total bytes', header_format)
    if argv.s3_check:
        summary_worksheet.write(0, 7, 'files on S3', header_format)
        summary_worksheet.write(0, 8, 'bytes on S3', header_format)
    # widths
    summary_worksheet.set_column(0, 1, 10)
    summary_worksheet.set_column(2, 2, 25)
    summary_worksheet.set_column(3, 4, 10)
    summary_worksheet.set_column(5, 5, 25)
    summary_worksheet.set_column(6, 7, 10)
    summary_worksheet.set_column(8, 8, 25)
    summary_worksheet.set_column(9, 9, 10)

    true_count = dedup_total = total_count = running_total = 0
    row = 1
    for campus in campuses:
        basename = os.path.basename(campus['path'])
        documents = nx.nxql(
            'select * from Document where ecm:path startswith"{0}"'.format(
                campus['path']))
        (this_count, this_total, dedup_count,
         dedup_bytes) = forCampus(documents, basename, file_check, argv.outdir[0])
        summary_worksheet.write(row, 0, basename)
        summary_worksheet.write(row, 1, dedup_count, number_format)
        summary_worksheet.write(row, 2, dedup_bytes, number_format)
        summary_worksheet.write(row, 3, sizeof_fmt(dedup_bytes))
        summary_worksheet.write(row, 4, this_count, number_format)
        summary_worksheet.write(row, 5, this_total, number_format)
        summary_worksheet.write(row, 6, sizeof_fmt(this_total))
        total_count = total_count + this_count      # number of files
        running_total = running_total + this_total  # number of bytes
        true_count = true_count + dedup_count
        dedup_total = dedup_total + dedup_bytes     # number of bytes
        row = row + 1

    summary_worksheet.write(row, 0, '{}'.format(today))
    summary_worksheet.write(row, 1, true_count, number_format)
    summary_worksheet.write(row, 2, dedup_total, number_format)
    summary_worksheet.write(row, 3, sizeof_fmt(dedup_total))
    summary_worksheet.write(row, 4, total_count, number_format)
    summary_worksheet.write(row, 5, running_total, number_format)
    summary_worksheet.write(row, 6, sizeof_fmt(running_total))
    if argv.s3_check:
        summary_worksheet.write(row, 7, s3_count, number_format)
        summary_worksheet.write(row, 8, s3_bytes, number_format)
        summary_worksheet.write(row, 9, sizeof_fmt(s3_bytes))

    summary_workbook.close()
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='nxidbatch mints a batch of ARKs')
    parser.add_argument('batchsize', nargs=1, help='size of ARK batch', type=int)

    ezid_group = parser.add_argument_group('minting behaviour flags')
    ezid_group.add_argument('--mint', '-m', action='store_true',
                            help='mint ARKs without prompt')
    ezid_group.add_argument('--output', '-o',
                            type=lambda x: is_valid_file(parser, x),
                            required=True)

    conf_group = parser.add_argument_group('EZID configuration and metadata')
    conf_group.add_argument('--ezid-username',
                            help='username for EZID API (overrides rcfile)',
                            type=utf8_arg)
    conf_group.add_argument('--ezid-password',
                            help='password for EZID API (overrides rc file)',
                            type=utf8_arg)
    conf_group.add_argument('--shoulder', help='shoulder (overrides rcfile)',
                            type=utf8_arg)
    conf_group.add_argument('--owner', help='set as _owner for EZID',
                            type=utf8_arg)
    conf_group.add_argument(
        '--status',
        help='set as _status for EZID (default reserved, or public|unavailable)',
        default="reserved",
        type=utf8_arg)
    conf_group.add_argument('--publisher', help='set as dc.publisher for EZID',
                            type=utf8_arg)

    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())

    # read config out of .pynuxrc file
    username = argv.ezid_username or nx.ezid_conf['username']
    password = argv.ezid_password or nx.ezid_conf['password']
    shoulder = argv.shoulder or nx.ezid_conf['shoulder']
    ezid = EZID.EZIDClient(
        credentials=dict(username=username, password=password))

    if argv.mint:
        output = open(argv.output, 'w')
    else:
        # https://stackoverflow.com/a/26514097/1763984
        answer = raw_input(
            'Mint a batch {} of {} ARKs with prefix {} with EZID? [y/n]'.format(
                argv.output, argv.batchsize, shoulder))
        if not answer or answer[0].lower() != 'y':
            print('You did not indicate approval')
            exit(1)
        else:
            output = open(argv.output, 'w')

    for __ in range(argv.batchsize[0]):
        # mint
        new_ark = ezid.mint(shoulder)
        print(new_ark, file=output)

    if not argv.mint:
        print('done')
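# Example invocations (the script name is taken from the description above;
# the output filename is a placeholder):
#   nxidbatch 100 --output arks.txt          # prompts before minting
#   nxidbatch 100 --output arks.txt --mint   # mints without the prompt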
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='nxid finds top level objects in Nuxeo and syncs them up with EZID')
    parser.add_argument('path', nargs=1, help='nuxeo path (folder or object)',
                        type=utf8_arg)

    ezid_group = parser.add_argument_group('minting behaviour flags')
    ezid_group.add_argument(
        '--mint', '-m', action='store_true',
        help='when an ARK is missing, mint and bind new ARK in EZID')
    ezid_group.add_argument(
        '--create', '-c', action='store_true',
        help='when an ARK is found in Nuxeo but not EZID, create EZID')
    ezid_group.add_argument(
        '--update', '-u', action='store_true',
        help='when an ARK is found in Nuxeo and EZID, update EZID')
    ezid_group.add_argument(
        '--no-noop-report', action='store_true',
        help='override default behaviour of reporting on noops')
    ezid_group.add_argument('--show-erc', action='store_true',
                            help='show ANVL record that will be sent to EZID')

    conf_group = parser.add_argument_group('EZID configuration and metadata')
    conf_group.add_argument('--ezid-username',
                            help='username for EZID API (overrides rcfile)',
                            type=utf8_arg)
    conf_group.add_argument('--ezid-password',
                            help='password for EZID API (overrides rc file)',
                            type=utf8_arg)
    conf_group.add_argument('--shoulder', help='shoulder (overrides rcfile)',
                            type=utf8_arg)
    conf_group.add_argument('--owner', help='set as _owner for EZID',
                            type=utf8_arg)
    conf_group.add_argument(
        '--status',
        help='set as _status for EZID (public|reserved|unavailable)',
        type=utf8_arg)
    conf_group.add_argument('--publisher', help='set as dc.publisher for EZID',
                            type=utf8_arg)
    conf_group.add_argument('--location', help='set location URL prefix for EZID',
                            type=utf8_arg)

    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())

    # read config out of .pynuxrc file
    username = argv.ezid_username or nx.ezid_conf['username']
    password = argv.ezid_password or nx.ezid_conf['password']
    shoulder = argv.shoulder or nx.ezid_conf['shoulder']
    ezid = EZID.EZIDClient(
        credentials=dict(username=username, password=password))

    # query to select all parent level objects
    documents = nx.nxql(u'''
        SELECT * FROM SampleCustomPicture, CustomFile, CustomVideo, CustomAudio
        WHERE ecm:path STARTSWITH "{}"
        AND ecm:currentLifeCycleState != "deleted"
        AND ecm:pos is NULL'''.format(argv.path[0]))

    # if the user gives the full path to a document
    # https://stackoverflow.com/a/3114640/1763984
    if not any(True for _ in documents):
        documents = nx.nxql(u'''
            SELECT * FROM SampleCustomPicture, CustomFile, CustomVideo, CustomAudio
            WHERE ecm:path = "{}"
            AND ecm:currentLifeCycleState != "deleted"
            AND ecm:pos is NULL'''.format(argv.path[0]))

    report = not argv.no_noop_report

    # main loop
    for item in documents:
        # check id for ARK
        ark = find_ark(item['properties']['ucldc_schema:identifier'])
        path = item['path']

        # if there is an ARK, check for a record in EZID
        ezid_status = None
        if ark is not None:
            ezid_status = check_ezid(ark, ezid)

        ezdata = item_erc_dict(
            item,
            owner=argv.owner,          # _owner
            status=argv.status,        # _status
            publisher=argv.publisher,  # dc.publisher
            location=argv.location     # _target
        )

        if argv.show_erc:
            print(EZID.formatAnvlFromDict(ezdata))
            print('')

        # mint
        if not ark and not ezid_status:
            if argv.mint:
                new_ark = ezid.mint(shoulder, ezdata)
                update_nuxeo(item, nx, new_ark)
                print('✓ mint "{}" {}'.format(path, new_ark))
            elif report:
                print('ℹ noop mint "{}"'.format(path))

        # create
        if ark and not ezid_status:
            if argv.create:
                ezid.create(ark, ezdata)
                print('✓ create "{}" {}'.format(path, ark))
            elif report:
                print('ℹ noop create "{}" {}'.format(path, ark))

        # update
        if ark and ezid_status:
            owner = get_owner(ezid_status)
            if argv.update:
                ezid.update(ark, ezdata)
                print('✓ update "{}" {}'.format(path, ark))
            elif report:
                print('ℹ noop update "{}" {} {}'.format(path, ark, owner))
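# Example invocations (the script name is taken from the description above;
# the path is a placeholder):
#   nxid /asset-library/UCX/example_collection             # report only (noops)
#   nxid /asset-library/UCX/example_collection -m -c -u    # mint, create, and update EZID records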
def main(argv=None):
    parser = argparse.ArgumentParser(description='nuxeo metadata via REST API')
    parser.add_argument('path', nargs=1, help="nuxeo document path")
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    documents = nx.children(argv.path[0])

    # open the workbook
    workbook = xlsxwriter.Workbook('qa.xlsx')
    header_format = workbook.add_format({'bold': True})
    report = workbook.add_worksheet()
    report.set_column(0, 0, 10)
    report.set_column(1, 2, 40)
    report.set_column(3, 4, 80)
    report.write(0, 0, 'nuxeo-uid', header_format)
    report.write(0, 1, 'ucldc_schema:localidentifier', header_format)
    report.write(0, 2, 'filename', header_format)
    report.write(0, 3, 'nuxeo-path', header_format)
    report.write(0, 4, 'title', header_format)

    # document specified on command line
    root_doc = nx.get_metadata(path=argv.path[0])
    report.write(1, 0, root_doc['uid'])
    report.write(1, 3, argv.path[0])

    row = 2
    for document in documents:
        p = document['properties']
        report.write(row, 0, document['uid'])
        report.write(row, 1, p['ucldc_schema:localidentifier'][0])
        if 'file:filename' in p:
            report.write(row, 2, p['file:filename'])
        report.write(row, 3, document['path'].replace(argv.path[0], '', 1))
        report.write(row, 4, document['title'])
        row = row + 1

    # output: path | localid | title
    workbook.close()
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--datafile", type=str, required=True,
                        help="CSV data input file -- required")
    parser.add_argument("--rcfile", type=str, required=True,
                        help="Pynux resource file -- required")
    parser.add_argument("--loglevel", type=str, required=False, default="INFO",
                        help="Set Pynux logging level")

    try:
        args = parser.parse_args()
    except:
        print "Unable to parse input parameters..."
        sys.exit(2)

    try:
        assert os.path.isfile(args.datafile)
    except AssertionError:
        print "Not a file: ", args.datafile
        sys.exit(2)

    try:
        assert os.path.isfile(args.rcfile)
    except AssertionError:
        print "Not a file: ", args.rcfile
        sys.exit(2)

    csv_data_file = args.datafile
    print csv_data_file
    print args.rcfile
    print args.loglevel

    nx = utils.Nuxeo(rcfile=args.rcfile, loglevel=args.loglevel.upper())
    nuxeo_limit = 24

    # get an instance of the Csv2Dict class, which must be initialized
    # with the name of an input data (csv) file
    csv2dict = Csv2Dict(csv_data_file)
    if csv2dict.status != 0:
        print 'The Csv2Dict constructor reported an error (%d).' % csv2dict.status
        sys.exit(csv2dict.status)

    process_rows(csv2dict)

    for n in range(csv2dict.get_meta_dict_length()):
        print "Loading payload %d" % n
        payload = csv2dict.get_meta_dict(n)
        print payload['path']
        uid = nx.get_uid(payload['path'])
        print "Returned UID: %d) %s." % (n, uid)
        print payload
        print payload['path']
        nx.update_nuxeo_properties(payload, path=payload['path'])

    csv2dict.print_meta_dicts('LOGS/latest_output.txt')
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='print info for items in collection where media.json '
                    'file is missing.')
    parser.add_argument('path', help="Nuxeo document path for collection")
    parser.add_argument('bucket', help="S3 bucket name")
    parser.add_argument("--pynuxrc", default='~/.pynuxrc',
                        help="rc file for use by pynux")
    parser.add_argument('--stash', action="store_true",
                        help="create and stash missing media.json file")
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nuxeo_path = argv.path
    bucketpath = argv.bucket
    pynuxrc = argv.pynuxrc
    stash = argv.stash
    print("collection nuxeo_path:", nuxeo_path)

    # get the Nuxeo ID for the collection
    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    nuxeo_id = nx.get_uid(nuxeo_path)
    print("collection nuxeo_id:", nuxeo_id)

    # connect to S3
    conn = connect_s3(calling_format=OrdinaryCallingFormat())
    bucketpath = bucketpath.strip("/")
    bucketbase = bucketpath.split("/")[0]
    print("bucketpath:", bucketpath)
    print("bucketbase:", bucketbase)
    try:
        bucket = conn.get_bucket(bucketbase)
    except boto.exception.S3ResponseError:
        print("bucket doesn't exist on S3:", bucketbase)

    items = nx.children(nuxeo_path)
    for item in items:
        obj_key = "{0}-media.json".format(item['uid'])
        s3_url = "s3://{0}/{1}".format(bucketpath, obj_key)
        #print "s3_url:", s3_url
        parts = urlparse.urlsplit(s3_url)
        #print "obj_key", obj_key
        #print "s3_url", s3_url
        if item['type'] != 'Organization' and not bucket.get_key(parts.path):
            print("object doesn't exist on S3:", parts.path, item['path'])
            if stash:
                nxstash = NuxeoStashMediaJson(
                    item['path'], MEDIA_JSON_BUCKET, MEDIA_JSON_REGION,
                    pynuxrc, True)
                nxstash.nxstashref()
                print("stashed for", item['path'])
def main(argv=None):
    parser = argparse.ArgumentParser(
        description="extent stats via Nuxeo REST API")
    parser.add_argument("outdir", nargs=1)
    parser.add_argument("--no-s3-check", dest="s3_check", action="store_false")
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    os.makedirs(argv.outdir[0], exist_ok=True)

    # look up all the files in S3, so we can double check that all
    # the files exist as we loop through Nuxeo
    file_check = None
    s3_bytes = s3_count = 0
    if argv.s3_check:
        from boto import s3
        from boto.s3.connection import OrdinaryCallingFormat
        file_check = {}
        conn = s3.connect_to_region("us-west-2",
                                    calling_format=OrdinaryCallingFormat())
        bucket = conn.get_bucket("data.nuxeo.cdlib.org.oregon")
        for count, key in enumerate(bucket.list()):
            file_check[key.name] = key.size
            if count % 50000 == 0:
                print("{0} s3 files memorized".format(count), file=sys.stderr)
            s3_bytes = s3_bytes + key.size
        s3_count = len(file_check)

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())

    campuses = [
        "UCB", "UCD", "UCI", "UCLA", "UCM", "UCOP",
        "UCR", "UCSB", "UCSC", "UCSD", "UCSF",
    ]

    summary_workbook = xlsxwriter.Workbook(
        os.path.join(argv.outdir[0], "{}-summary.xlsx".format(today)))

    # cell formats
    header_format = summary_workbook.add_format({"bold": True})
    number_format = summary_workbook.add_format()
    number_format.set_num_format("#,##0")

    summary_worksheet = summary_workbook.add_worksheet("summary")
    # headers
    summary_worksheet.write(0, 1, "deduplicated files", header_format)
    summary_worksheet.write(0, 2, "deduplicated bytes", header_format)
    summary_worksheet.write(0, 4, "total files", header_format)
    summary_worksheet.write(0, 5, "total bytes", header_format)
    if argv.s3_check:
        summary_worksheet.write(0, 7, "files on S3", header_format)
        summary_worksheet.write(0, 8, "bytes on S3", header_format)
    # widths
    summary_worksheet.set_column(0, 1, 10)
    summary_worksheet.set_column(2, 2, 25)
    summary_worksheet.set_column(3, 4, 10)
    summary_worksheet.set_column(5, 5, 25)
    summary_worksheet.set_column(6, 7, 10)
    summary_worksheet.set_column(8, 8, 25)
    summary_worksheet.set_column(9, 9, 10)

    true_count = dedup_total = total_count = running_total = 0
    row = 1
    for campus in campuses:
        (this_count, this_total, dedup_count,
         dedup_bytes) = forCampus(campus, file_check, argv.outdir[0], nx)
        # write out this row in the sheet
        summary_worksheet.write(row, 0, campus)
        summary_worksheet.write(row, 1, dedup_count, number_format)
        summary_worksheet.write(row, 2, dedup_bytes, number_format)
        summary_worksheet.write(row, 3, sizeof_fmt(dedup_bytes))
        summary_worksheet.write(row, 4, this_count, number_format)
        summary_worksheet.write(row, 5, this_total, number_format)
        summary_worksheet.write(row, 6, sizeof_fmt(this_total))
        # keep track of running totals
        total_count = total_count + this_count      # number of files
        running_total = running_total + this_total  # number of bytes
        true_count = true_count + dedup_count
        dedup_total = dedup_total + dedup_bytes     # number of bytes
        row = row + 1

    # write totals in the summary worksheet
    summary_worksheet.write(row, 0, "{}".format(today))
    summary_worksheet.write(row, 1, true_count, number_format)
    summary_worksheet.write(row, 2, dedup_total, number_format)
    summary_worksheet.write(row, 3, sizeof_fmt(dedup_total))
    summary_worksheet.write(row, 4, total_count, number_format)
    summary_worksheet.write(row, 5, running_total, number_format)
    summary_worksheet.write(row, 6, sizeof_fmt(running_total))
    if argv.s3_check:
        summary_worksheet.write(row, 7, s3_count, number_format)
        summary_worksheet.write(row, 8, s3_bytes, number_format)
        summary_worksheet.write(row, 9, sizeof_fmt(s3_bytes))

    summary_workbook.close()