Exemplo n.º 1
0
    def __init__(self, collection_id, pynuxrc=''):
        """Set up a Nuxeo client plus collection-level identifiers.

        collection_id -- registry identifier for the collection
        pynuxrc -- optional path to a pynux rc file; when empty,
            falls back to ~/.pynuxrc if that file exists.

        Raises ValueError when no ATOM feed filename can be derived
        from the collection id.
        """
        # NOTE(review): the rcfile handles opened here are never closed,
        # and if pynuxrc is empty AND ~/.pynuxrc does not exist, self.nx
        # is never assigned -- later use of self.nx would raise
        # AttributeError. Confirm whether that case can occur.
        if pynuxrc:
            self.nx = utils.Nuxeo(rcfile=open(pynuxrc,'r'))
        elif not(pynuxrc) and os.path.isfile(expanduser('~/.pynuxrc')):
            self.nx = utils.Nuxeo(rcfile=open(expanduser('~/.pynuxrc'),'r'))

        self.collection_id = collection_id
        # document path and Merritt id are derived by helpers defined
        # elsewhere in this class
        self.path = self._get_nuxeo_path()
        self.merritt_id = self.get_merritt_id(self.path)
 
        self.atom_file = self._get_filename(self.collection_id)
        if not self.atom_file:
            raise ValueError("Could not create filename for ATOM feed based on collection id: {}".format(self.collection_id))
def main(argv=None):

    parser = argparse.ArgumentParser(description='check for existence of jp2 file on s3 for given nuxeo path')
    parser.add_argument('path', help="Nuxeo document path")
    parser.add_argument('bucket', help="S3 bucket name")
    parser.add_argument('--pynuxrc', default='~/.pynux-prod', help="rc file for use by pynux")

    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nuxeo_path = argv.path
    bucketpath = argv.bucket

    nx = utils.Nuxeo(rcfile=argv.pynuxrc, loglevel=argv.loglevel.upper())
    # just for simple objects for now
    objects = nx.children(argv.path)
    print "\nFound objects at {}.\nChecking S3 bucket {} for existence of corresponding files.\nThis could take a while...".format(nuxeo_path, bucketpath)
    i = 0
    for obj in objects:
        nuxeo_id = nx.get_uid(obj['path'])
        check_object_on_s3(nuxeo_id, bucketpath)
        i = i + 1

    print "Done. Checked {} objects".format(i)
Exemplo n.º 3
0
def google_object(filepath, url):
    """Export object-level metadata for *filepath* into a Google Sheet.

    Builds a temporary CSV from object_level() output, imports it into
    the spreadsheet at *url* (replacing its contents), then renames the
    first worksheet after the document's dc:title.
    """
    # imports kept local so gspread/oauth2client are only required when
    # this export path is actually used
    import gspread
    from oauth2client.service_account import ServiceAccountCredentials
    obj = object_level(filepath)
    nx = utils.Nuxeo()
    scope = [
        'https://spreadsheets.google.com/feeds',
        'https://www.googleapis.com/auth/drive'
    ]
    # service-account key is expected in the current working directory
    creds = ServiceAccountCredentials.from_json_keyfile_name(
        'client_secret.json', scope)
    client = gspread.authorize(creds)
    with open("temp.csv", "wb") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=obj['fieldnames'])
        writer.writeheader()
        for row in obj['data']:
            writer.writerow(row)
    # NOTE(review): the file is written in binary mode (Python 2 csv
    # style) but re-read with the Python 3 ``encoding`` keyword, while
    # other code in this file uses Python 2 print statements -- confirm
    # the intended runtime.
    with open("temp.csv", encoding="utf8") as f:
        s = f.read() + '\n'
    sheet_id = client.open_by_url(url).id
    client.import_csv(sheet_id, s)
    client.open_by_key(sheet_id).sheet1.update_title(
        "nuxeo_object_%s" %
        nx.get_metadata(path=filepath)['properties']['dc:title'])
    os.remove("temp.csv")
Exemplo n.º 4
0
def main(argv=None):

    parser = argparse.ArgumentParser(description='check for existence of jp2 file on s3 for given nuxeo path')
    parser.add_argument('path', help="Nuxeo document path")

    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nuxeo_path = argv.path
    
    print "\nnuxeo_path:", nuxeo_path

    # get the Nuxeo ID
    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    nuxeo_id = nx.get_uid(nuxeo_path)
    print "nuxeo_id:", nuxeo_id

    download_url = get_download_url(nuxeo_id, nuxeo_path, nx)
    print download_url, '\n'

    filename = os.path.basename(nuxeo_path)
    filepath = os.path.join(os.getcwd(), filename)
    download_nuxeo_file(download_url, filepath, nx)

    print "\nDone\n"
Exemplo n.º 5
0
def main(argv=None):
    """Report blobs that appear in more than one place (same digest)
    under /asset-library/UCM."""
    parser = argparse.ArgumentParser(
        description='extent stats via Nuxeo REST API')
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())

    documents = nx.nxql(
        'select * from Document where ecm:path startswith "/asset-library/UCM"'
    )

    # digest -> list of (uid, "path#xpath") occurrences
    duplicates = defaultdict(list)

    row = 0
    for document in documents:
        for blob in blob_from_doc(document):
            if blob:
                duplicates[blob['digest']].append(
                    (blob['uid'],
                     u'{0}#{1}'.format(blob['path'],
                                       blob['xpath']).encode('utf-8')))
        # NOTE(review): ``row`` counts documents, not blobs, despite
        # the progress message text
        if row % 25000 == 0:
            print '{0} blobs checked'.format(row)
        row = row + 1
    # keep only digests that were seen in more than one location
    duplicates = {k: v
                  for k, v in duplicates.items()
                  if len(v) > 1}  # http://stackoverflow.com/a/8425075
    pp(duplicates)
    print(len(duplicates))
Exemplo n.º 6
0
def object_level(filepath):
    """Collect object-level metadata rows for each child of *filepath*.

    Returns a dict with:
      fieldnames -- column order ('File path', 'Title', 'Type' first,
          then every other key encountered, in first-seen order)
      data -- one dict per child document
      filename -- suggested tsv name based on the parent's dc:title
    """
    nx = utils.Nuxeo()
    data = []
    for n in nx.children(filepath):
        data2 = {}

        # each helper maps one metadata facet of document ``n`` into
        # the row dict ``data2`` (most take the module-level
        # ``all_headers`` collection as well)
        get_title(data2, n)
        get_filepath(data2, n)
        get_type(data2, n, all_headers)
        get_alt_title(data2, n, all_headers)
        get_identifier(data2, n, all_headers)
        get_local_identifier(data2, n, all_headers)
        get_campus_unit(data2, n, all_headers)
        get_date(data2, n, all_headers)
        get_publication(data2, n, all_headers)
        get_creator(data2, n, all_headers)
        get_contributor(data2, n, all_headers)
        get_format(data2, n, all_headers)
        get_description(data2, n, all_headers)
        get_extent(data2, n, all_headers)
        get_language(data2, n, all_headers)
        get_temporal_coverage(data2, n, all_headers)
        get_transcription(data2, n, all_headers)
        get_access_restrictions(data2, n, all_headers)
        get_rights_statement(data2, n, all_headers)
        get_rights_status(data2, n, all_headers)
        get_copyright_holder(data2, n, all_headers)
        get_copyright_info(data2, n, all_headers)
        get_collection(data2, n, all_headers)
        get_related_resource(data2, n, all_headers)
        get_source(data2, n, all_headers)
        get_subject_name(data2, n, all_headers)
        get_place(data2, n, all_headers)
        get_subject_topic(data2, n, all_headers)
        get_form_genre(data2, n, all_headers)
        get_provenance(data2, n, all_headers)
        get_physical_location(data2, n, all_headers)

        data.append(data2)

    fieldnames = [
        'File path', 'Title', 'Type'
    ]  # ensures that File path, Title and Type are the first three columns
    for data2 in data:
        for key, value in data2.items():
            if key not in fieldnames:
                fieldnames.append(key)

    return {
        'fieldnames':
        fieldnames,
        'data':
        data,
        'filename':
        "nuxeo_object_%s.tsv" %
        nx.get_metadata(path=filepath)['properties']['dc:title']
    }
Exemplo n.º 7
0
 def get_existing_data(self, filepath, metadata_path):
     """Return the current ucldc_schema:<metadata_path> value for the
     document at *filepath*, fetching (and caching) the document's
     metadata on first use.

     When self.blankout is set, returns [] so every field is treated
     as empty (i.e. to be blanked out).
     """
     # idiom fix: truthiness test instead of ``== True``
     if self.blankout:
         return []
     # idiom fix: ``is None`` instead of ``== None``; the fetched
     # metadata is cached on self.data so the Nuxeo API is hit once
     if self.data is None:
         nx = utils.Nuxeo()
         self.data = nx.get_metadata(path=filepath)
     return self.data['properties']['ucldc_schema:{}'.format(
         metadata_path)]
Exemplo n.º 8
0
def main(argv=None):
    parser = argparse.ArgumentParser(description='convert an object to jp2')
    parser.add_argument('path', help="Nuxeo document path")

    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    print argv.path
    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
Exemplo n.º 9
0
def main(argv):
    """Load a tab-delimited spreadsheet and push each row's metadata
    to the matching Nuxeo document (unless --dry-run)."""
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--datafile",
        type=str,
        required=True,
        help="tab-delimited spreadsheet input file -- required")

    parser.add_argument('-d', '--dry-run', action='store_true', help='dry run')

    parser.add_argument('--blankout',
                        action='store_true',
                        help='blank out all fields not set in sheet')

    utils.get_common_options(parser)

    # NOTE(review): the ``argv`` parameter is ignored; args always come
    # from sys.argv
    args = parser.parse_args()

    try:
        assert os.path.isfile(args.datafile)
    except AssertionError:
        print("Not a file: ", args.datafile)
        sys.exit(2)

    csv_data_file = args.datafile
    print(csv_data_file)
    print(args.rcfile)
    print(args.loglevel)

    nx = utils.Nuxeo(rcfile=args.rcfile, loglevel=args.loglevel.upper())
    # NOTE(review): ``nuxeo_limit`` is never used in this function
    nuxeo_limit = 24

    # get an instance of the Csv2Dict class which must be initialized
    # with the name of an input data (csv) file

    csv2dict = Csv2Dict(csv_data_file, blankout=args.blankout)

    if csv2dict.status != 0:
        print('The Csv2Dict constructor reported and error (%d).' %
              csv2dict.status)
        sys.exit(csv2dict.status)

    process_rows(csv2dict)

    # one payload per spreadsheet row; only write back when not a dry run
    for n in range(csv2dict.get_meta_dict_length()):
        print("Loading payload %d" % n)
        payload = csv2dict.get_meta_dict(n)
        print(payload)
        print(payload['path'])
        if not args.dry_run:
            uid = nx.get_uid(payload['path'])
            print("Returned UID: %d) %s." % (n, uid))
            nx.update_nuxeo_properties(payload, path=payload['path'])
Exemplo n.º 10
0
    def __init__(self, path, s3_bucket_mediajson='', **kwargs):
        """Initialize with a Nuxeo document path.

        path -- Nuxeo document path
        s3_bucket_mediajson -- S3 bucket for media.json output
        kwargs:
            pynuxrc -- path to a pynux rc file (takes precedence)
            conf_pynux -- dict of pynux config values
            uid -- document uid, to skip the lookup by path
        """
        # get configuration and initialize pynux.utils.Nuxeo
        # NOTE(review): the rcfile handle opened here is never closed
        self.nx = None
        if 'pynuxrc' in kwargs:
            pynuxrc = kwargs['pynuxrc']
            self.nx = utils.Nuxeo(rcfile=open(expanduser(pynuxrc), 'r'))
        elif 'conf_pynux' in kwargs:
            conf_pynux = kwargs['conf_pynux']
            self.nx = utils.Nuxeo(conf=conf_pynux)
        else:
            self.nx = utils.Nuxeo(conf={})

        self.path = path

        # prefer a caller-supplied uid to avoid an extra API round trip
        if 'uid' in kwargs:
            self.uid = kwargs['uid']
        else:
            self.uid = self.nx.get_uid(self.path)

        self.s3_bucket_mediajson = s3_bucket_mediajson
Exemplo n.º 11
0
def main(argv=None):
    """List Nuxeo documents at a path (optionally recursively) and
    either dump their metadata to --outdir or print/show them."""
    parser = argparse.ArgumentParser(description='nuxeo metadata via REST API')
    parser.add_argument('path',
                        nargs=1,
                        help='nuxeo document path',
                        type=utf8_arg)
    parser.add_argument(
        '--outdir',
        help="directory to hold application/json+nxentity .json files",
        type=utf8_arg)
    # recursion style: folders vs objects (mutually exclusive)
    rstyle = parser.add_mutually_exclusive_group(required=False)
    rstyle.add_argument('--recursive-folders',
                        help='recursively list project folders/Organzation',
                        action='store_true')
    rstyle.add_argument('--recursive-objects',
                        help='recursively list objects',
                        action='store_true')
    # output style: uid only, path only, or a custom mapper module
    show = parser.add_mutually_exclusive_group(required=False)
    show.add_argument('--show-only-uid', action='store_true')
    show.add_argument('--show-only-path', action='store_true')
    show.add_argument('--show-custom-function')
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())

    if argv.recursive_folders:
        documents = nx.recursive_project_folders(argv.path[0])
    elif argv.recursive_objects:
        documents = nx.recursive_objects(argv.path[0])
    else:
        # non-recursive: the document itself (via nxql) plus its
        # immediate children
        documents = itertools.chain(
            nx.nxql(u'select * from Document where ecm:path="{}"'.format(
                argv.path[0])), nx.children(argv.path[0]))

    if argv.outdir:
        # Expand user- and relative-paths
        outdir = os.path.abspath(os.path.expanduser(argv.outdir))
        nx.copy_metadata_to_local(documents, outdir)
    elif argv.show_only_path is True:
        for document in documents:
            print(document['path'])
    elif argv.show_only_uid is True:
        for document in documents:
            print(document['uid'])
    elif argv.show_custom_function:
        # the named module must expose nuxeo_mapper(documents, nx)
        mapper = importlib.import_module(argv.show_custom_function)
        mapper.nuxeo_mapper(documents, nx)
    else:
        nx.print_document_summary(documents)
Exemplo n.º 12
0
def main(argv=None):
    """Fetch json metadata for the Nuxeo document at the given path."""
    parser = argparse.ArgumentParser(
        description='Print nuxeo json metadata for object.')
    parser.add_argument('path', help="Nuxeo document path")

    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    path = argv.path

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    uid = nx.get_uid(path)
    metadata = nx.get_metadata(uid=uid)
    # NOTE(review): ``metadata`` is fetched but never printed or
    # returned, despite the parser description -- output code appears
    # to be missing here.
Exemplo n.º 13
0
def main(argv=None):

    parser = argparse.ArgumentParser(
        description='Print nuxeo path for given uid.')
    parser.add_argument('uid', help="Nuxeo uid")

    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    uid = argv.uid

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    metadata = nx.get_metadata(uid=uid)
    path = metadata['path']
    print path, uid
Exemplo n.º 14
0
def main(argv=None):
    """Print the S3 key for each collection item whose media.json
    file is missing from the given bucket."""
    parser = argparse.ArgumentParser(
        description='print info for items in collection where media.json '
                    'file is missing.'
    )
    parser.add_argument('path', help="Nuxeo document path for collection")
    parser.add_argument('bucket', help="S3 bucket name")

    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nuxeo_path = argv.path
    bucketpath = argv.bucket

    print "collection nuxeo_path:", nuxeo_path

    # get the Nuxeo ID for the collection
    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    nuxeo_id = nx.get_uid(nuxeo_path)
    print "collection nuxeo_id:", nuxeo_id

    # connect to S3; the bucket name is the first path component
    conn = connect_s3(calling_format=OrdinaryCallingFormat())
    bucketpath = bucketpath.strip("/")
    bucketbase = bucketpath.split("/")[0]
    print "bucketpath:", bucketpath
    print "bucketbase:", bucketbase

    # NOTE(review): if get_bucket raises, execution continues and
    # ``bucket`` below is unbound, causing a NameError -- the except
    # clause should probably return/exit.
    try:
        bucket = conn.get_bucket(bucketbase)
    except boto.exception.S3ResponseError:
        print "bucket doesn't exist on S3:", bucketbase

    # each item should have a <uid>-media.json object in the bucket
    items = nx.children(nuxeo_path)
    for item in items:
        obj_key = "{0}-media.json".format(item['uid'])
        s3_url = "s3://{0}/{1}".format(bucketpath, obj_key)
        #print "s3_url:", s3_url
        parts = urlparse.urlsplit(s3_url)
        #print "obj_key", obj_key
        #print "s3_url", s3_url

        if not (bucket.get_key(parts.path)):
            print "object doesn't exist on S3:", parts.path
        # NOTE(review): stray triple-quote below opens a string literal;
        # likely a truncated commented-out section in the original file.
        '''
Exemplo n.º 15
0
def main(argv=None):

    parser = argparse.ArgumentParser(
        description='get media.json file for given nuxeo path')
    parser.add_argument('path', help="Nuxeo document path")
    parser.add_argument('bucket', help="S3 bucket name")

    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nuxeo_path = argv.path
    bucketpath = argv.bucket

    print "nuxeo_path:", nuxeo_path

    # get the Nuxeo ID
    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    nuxeo_id = nx.get_uid(nuxeo_path)
    print "nuxeo_id:", nuxeo_id

    # see if a media.json file exists on S3 for this object
    conn = connect_s3(calling_format=OrdinaryCallingFormat())
    bucketpath = bucketpath.strip("/")
    bucketbase = bucketpath.split("/")[0]
    obj_key = "{0}-media.json".format(nuxeo_id)
    s3_url = "s3://{0}/{1}".format(bucketpath, obj_key)
    print "s3_url:", s3_url
    parts = urlparse.urlsplit(s3_url)
    print "bucketpath:", bucketpath
    print "bucketbase:", bucketbase

    try:
        bucket = conn.get_bucket(bucketbase)
    except boto.exception.S3ResponseError:
        print "bucket doesn't exist on S3:", bucketbase

    if not (bucket.get_key(parts.path)):
        print "object doesn't exist on S3:", parts.path
    else:
        print "yup the object exists!:", parts.path
        k = Key(bucket)
        k.key = parts.path
        print "\nfile contents:"
        print k.get_contents_as_string()
Exemplo n.º 16
0
    def __init__(self,
                 path,
                 bucket,
                 region,
                 pynuxrc='~/.pynuxrc',
                 replace=False,
                 **kwargs):
        """Set up a stash operation for one Nuxeo document.

        path -- Nuxeo document path
        bucket -- target S3 bucket
        region -- AWS region name
        pynuxrc -- pynux rc file path
        replace -- whether to overwrite existing stashed content
        kwargs:
            metadata -- pre-fetched document metadata (skips API call)
        """
        self.logger = logging.getLogger(__name__)

        self.path = path
        self.bucket = bucket
        self.pynuxrc = pynuxrc
        self.region = region
        self.replace = replace

        # NOTE(review): this rcfile handle is never closed
        self.nx = utils.Nuxeo(rcfile=open(expanduser(self.pynuxrc), 'r'))

        # accept pre-fetched metadata to avoid a redundant API call
        if 'metadata' in kwargs:
            self.metadata = kwargs['metadata']
            self.logger.info("got metadata from kwargs")
        else:
            self.metadata = self.nx.get_metadata(path=self.path)
            self.logger.info("got metadata via pynux utils")

        self.uid = self.metadata['uid']

        self.logger.info("initialized NuxeoStashRef with path {}".format(
            self.path.encode('ascii', 'replace')))

        # deep-harvest helper resolves the Calisphere object type from
        # the Nuxeo document type
        self.dh = DeepHarvestNuxeo(self.path, uid=self.uid)
        self.calisphere_type = self.dh.get_calisphere_object_type(
            self.metadata['type'])
        self.tmp_dir = tempfile.mkdtemp(dir='/tmp')  # FIXME put in conf

        # running report of what was done, keyed by attribute name
        self.report = {}
        self._update_report('uid', self.uid)
        self._update_report('path', self.path)
        self._update_report('bucket', self.bucket)
        self._update_report('replace', self.replace)
        self._update_report('pynuxrc', self.pynuxrc)
        self._update_report('calisphere_type', self.calisphere_type)
Exemplo n.º 17
0
def get_metadata(nuxeo_top_path, item_level=False):
    """Build metadata rows for documents under *nuxeo_top_path*.

    With item_level=True, descends one extra level and maps each
    grandchild item; otherwise maps the immediate children (objects).
    Returns a list of dicts, one per metadata row.
    """
    nx = utils.Nuxeo()

    rows = []
    children = nx.children(nuxeo_top_path)
    if item_level:
        for child in children:
            rows.extend(
                make_metadata_row(item) for item in nx.children(child["path"]))
    else:  # object level
        rows.extend(make_metadata_row(child) for child in children)

    return rows
Exemplo n.º 18
0
def main(argv=None):
    """Walk every Nuxeo document; write .json copies to --outdir or
    print a summary to stdout."""
    parser = argparse.ArgumentParser(description='nuxeo metadata via REST API')
    parser.add_argument(
        '--outdir',
        help="directory to hold application/json+nxentity .json files",
        type=utf8_arg)
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    # http://demo.nuxeo.com/nuxeo/api/v1/path/default-domain/@search\?query\=SELECT%20\*%20FROM%20Document\&pageSize\=2\&currentPageIndex\=1 | jq .
    # todo; add these defaults as parameters as well as env
    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    documents = nx.all()

    if not argv.outdir:
        nx.print_document_summary(documents)
    else:
        # expand user- and relative paths before writing local copies
        target = os.path.abspath(os.path.expanduser(argv.outdir))
        nx.copy_metadata_to_local(documents, target)
Exemplo n.º 19
0
def main(argv=None):
    """Run an NXQL query; write results to --outdir or print a summary."""
    parser = argparse.ArgumentParser(description='nxql via REST API')
    parser.add_argument('nxql', nargs=1, help="nxql query", type=utf8_arg)
    parser.add_argument(
        '--outdir',
        help="directory to hold application/json+nxentity .json files",
        type=utf8_arg)
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    results = nx.nxql(argv.nxql[0])

    if not argv.outdir:
        nx.print_document_summary(results)
    else:
        # expand user- and relative paths before writing local copies
        target = os.path.abspath(os.path.expanduser(argv.outdir))
        nx.copy_metadata_to_local(results, target)
Exemplo n.º 20
0
def main(argv=None):
    """Cross-check every Nuxeo blob's digest and size against the S3
    mirror bucket, reporting missing or size-mismatched files."""
    parser = argparse.ArgumentParser(
        description='extent stats via Nuxeo REST API')
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    # memorize name -> size for every key in the mirror bucket so the
    # per-blob check below is an O(1) lookup
    file_dict = {}
    conn = s3.connect_to_region('us-west-2',
                                calling_format=OrdinaryCallingFormat())
    bucket = conn.get_bucket('data.nuxeo.cdlib.org.oregon')
    for count, key in enumerate(bucket.list()):
        file_dict[key.name] = key.size
        if count % 50000 == 0:
            print('{0} s3 files memorized'.format(count))

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())

    documents = nx.nxql('select * from Document')

    row = 0
    for document in documents:
        for blob in blob_from_doc(document):
            if blob:
                s3_size = file_dict.get(blob['digest'], None)
                # bug fix: explicit None check so a legitimate
                # zero-byte S3 object is not reported as missing
                if s3_size is None:
                    print('{0} from {1} {2} not found in S3'.format(
                        blob['digest'], blob['path'], blob['xpath']))
                if file_dict.get(blob['digest'], 0) != int(blob['length']):
                    # bug fix: the nuxeo-size placeholder was a
                    # duplicate {3}; use {4} so blob['length'] prints
                    print(
                        '{0} from {1} {2} s3 size {3} does not match nuxeo size {4}'
                        .format(blob['digest'], blob['path'], blob['xpath'],
                                s3_size, blob['length']))
                if row % 25000 == 0:
                    print('{0} nuxeo blobs checked'.format(row))
                row = row + 1
Exemplo n.º 21
0
def main(argv=None):
    """main: push one application/json+nxentity record to Nuxeo,
    addressing the target by --uid, --path, or the file's own
    path/uid fields."""
    parser = argparse.ArgumentParser(
        description='nuxeo metadata via REST API, one record')
    parser.add_argument('file', nargs=1, help="application/json+nxentity")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--uid', help="update specific nuxeo uid")
    group.add_argument(
        '--path', help="update specific nuxeo path", type=utf8_arg)
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    # todo; add these defaults as parameters as well as env
    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    pp(argv.file[0])
    jfile = argv.file[0]
    uid = argv.uid
    path = argv.path
    # NOTE(review): this file handle is never closed
    json_data = open(jfile)
    data = json.load(json_data)
    ret = {}
    if uid:  # use uid supplied at command line
        ret = nx.update_nuxeo_properties(data, uid=uid)
    elif path:  # use path supplied at command line
        ret = nx.update_nuxeo_properties(data, path=path)
    # if no uid nor path was specified on the command line, then
    # prefer "path": to "uid": when importing files because the file may have
    # come from another machine where the uuids are different
    else:
        uid = nx.get_uid(data.get('path')) or data.get('uid')
        ret = nx.update_nuxeo_properties(data, uid=uid)
    # an empty/falsy return means no document could be addressed
    if not ret:
        print("no uid found, specify --uid or --path")
        exit(1)
    pp(ret)
Exemplo n.º 22
0
 def setUp(self):
     # build a Nuxeo client pointed at a mock endpoint, with an empty
     # in-memory rcfile so tests never read real config from disk
     self.nx = utils.Nuxeo({
         'api': 'http://mockme/r',
     },
                           rcfile=io.BytesIO(bytes()))
Exemplo n.º 23
0
def main(argv=None):
    """Produce per-campus extent stats from Nuxeo as an xlsx summary.

    Walks each campus folder under the root path, tallies file counts
    and byte totals (raw and deduplicated), optionally cross-checks
    against the S3 mirror bucket, and writes
    <outdir>/<today>-summary.xlsx with one row per campus plus a
    grand-total row.
    """
    parser = argparse.ArgumentParser(
        description='extent stats via Nuxeo REST API')
    parser.add_argument('path', nargs=1, help="root path")
    parser.add_argument(
        'outdir',
        nargs=1,
    )
    parser.add_argument('--no-s3-check', dest='s3_check', action='store_false')
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    # look up all the files in S3, so we can double check that all
    # the files exist as we loop through Nuxeo
    file_check = None
    s3_bytes = s3_count = 0
    if argv.s3_check:
        # boto imported lazily so --no-s3-check runs without it
        from boto import s3
        from boto.s3.connection import OrdinaryCallingFormat
        file_check = {}
        conn = s3.connect_to_region('us-west-2',
                                    calling_format=OrdinaryCallingFormat())
        bucket = conn.get_bucket('data.nuxeo.cdlib.org.oregon')
        for count, key in enumerate(bucket.list()):
            file_check[key.name] = key.size
            if count % 50000 == 0:
                print('{0} s3 files memorized'.format(count), file=sys.stderr)
            s3_bytes = s3_bytes + key.size
        s3_count = len(file_check)

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())

    # one summary row per campus folder under the root path
    campuses = nx.children(argv.path[0])

    summary_workbook = xlsxwriter.Workbook(
        os.path.join(argv.outdir[0], '{}-summary.xlsx'.format(today)))
    # cell formats
    header_format = summary_workbook.add_format({
        'bold': True,
    })
    number_format = summary_workbook.add_format()
    number_format.set_num_format('#,##0')

    summary_worksheet = summary_workbook.add_worksheet('summary')
    # headers
    summary_worksheet.write(0, 1, 'deduplicated files', header_format)
    summary_worksheet.write(0, 2, 'deduplicated bytes', header_format)
    summary_worksheet.write(0, 4, 'total files', header_format)
    summary_worksheet.write(0, 5, 'total bytes', header_format)
    if argv.s3_check:
        summary_worksheet.write(0, 7, 'files on S3', header_format)
        summary_worksheet.write(0, 8, 'bytes on S3', header_format)
    # widths (wide columns hold byte counts; narrow ones hold
    # human-readable sizes and counts)
    summary_worksheet.set_column(
        0,
        1,
        10,
    )
    summary_worksheet.set_column(
        2,
        2,
        25,
    )
    summary_worksheet.set_column(
        3,
        4,
        10,
    )
    summary_worksheet.set_column(
        5,
        5,
        25,
    )
    summary_worksheet.set_column(
        6,
        7,
        10,
    )
    summary_worksheet.set_column(
        8,
        8,
        25,
    )
    summary_worksheet.set_column(
        9,
        9,
        10,
    )
    # running grand totals across campuses
    true_count = dedup_total = total_count = running_total = 0
    row = 1
    for campus in campuses:
        basename = os.path.basename(campus['path'])
        documents = nx.nxql(
            'select * from Document where ecm:path startswith"{0}"'.format(
                campus['path']))
        # forCampus writes the per-campus detail and returns tallies
        (this_count, this_total, dedup_count,
         dedup_bytes) = forCampus(documents, basename, file_check,
                                  argv.outdir[0])
        summary_worksheet.write(row, 0, basename)
        summary_worksheet.write(row, 1, dedup_count, number_format)
        summary_worksheet.write(row, 2, dedup_bytes, number_format)
        summary_worksheet.write(row, 3, sizeof_fmt(dedup_bytes))
        summary_worksheet.write(row, 4, this_count, number_format)
        summary_worksheet.write(row, 5, this_total, number_format)
        summary_worksheet.write(row, 6, sizeof_fmt(this_total))
        total_count = total_count + this_count  # number of files
        running_total = running_total + this_total  # number of bytes
        true_count = true_count + dedup_count
        dedup_total = dedup_total + dedup_bytes  # number of bytes
        row = row + 1
    # grand-total row, labelled with today's date
    summary_worksheet.write(row, 0, '{}'.format(today))
    summary_worksheet.write(row, 1, true_count, number_format)
    summary_worksheet.write(row, 2, dedup_total, number_format)
    summary_worksheet.write(row, 3, sizeof_fmt(dedup_total))
    summary_worksheet.write(row, 4, total_count, number_format)
    summary_worksheet.write(row, 5, running_total, number_format)
    summary_worksheet.write(row, 6, sizeof_fmt(running_total))
    if argv.s3_check:
        summary_worksheet.write(row, 7, s3_count, number_format)
        summary_worksheet.write(row, 8, s3_bytes, number_format)
        summary_worksheet.write(row, 9, sizeof_fmt(s3_bytes))
    summary_workbook.close()
Exemplo n.º 24
0
def main(argv=None):
    """Mint a batch of reserved ARKs with EZID and write them, one per
    line, to the --output file."""
    parser = argparse.ArgumentParser(
        description='nxidbatch mints a batch of ARKs')

    # NOTE(review): nargs=1 makes argv.batchsize a one-element list;
    # the prompt below interpolates the list itself (e.g. "[500]"),
    # while the loop correctly uses argv.batchsize[0]
    parser.add_argument('batchsize',
                        nargs=1,
                        help='size of ARK batch',
                        type=int)

    ezid_group = parser.add_argument_group('minting behaviour flags')
    ezid_group.add_argument('--mint',
                            '-m',
                            action='store_true',
                            help='mint ARKs without prompt')
    ezid_group.add_argument('--output',
                            '-o',
                            type=lambda x: is_valid_file(parser, x),
                            required=True)

    conf_group = parser.add_argument_group('EZID configuration and metadata')
    conf_group.add_argument('--ezid-username',
                            help='username for EZID API (overrides rcfile)',
                            type=utf8_arg)
    conf_group.add_argument('--ezid-password',
                            help='password for EZID API (overrides rc file)',
                            type=utf8_arg)
    conf_group.add_argument('--shoulder',
                            help='shoulder (overrides rcfile)',
                            type=utf8_arg)
    conf_group.add_argument('--owner',
                            help='set as _owner for EZID',
                            type=utf8_arg)
    conf_group.add_argument(
        '--status',
        help=
        'set as _status for EZID (default reserved, or public|unavailable)',
        default="reserved",
        type=utf8_arg)
    conf_group.add_argument('--publisher',
                            help='set as dc.publisher for EZID',
                            type=utf8_arg)

    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())

    # read config out of .pynuxrc file; command-line flags win
    username = argv.ezid_username or nx.ezid_conf['username']
    password = argv.ezid_password or nx.ezid_conf['password']
    shoulder = argv.shoulder or nx.ezid_conf['shoulder']
    ezid = EZID.EZIDClient(
        credentials=dict(username=username, password=password))

    # NOTE(review): ``output`` is never explicitly closed; minted ARKs
    # rely on interpreter exit to flush.
    if argv.mint:
        output = open(argv.output, 'w')
    else:
        # https://stackoverflow.com/a/26514097/1763984
        answer = raw_input(
            'Mint a batch {} of {} ARKs with prefix {} with EZID? [y/n]'.
            format(argv.output, argv.batchsize, shoulder))
        if not answer or answer[0].lower() != 'y':
            print('You did not indicate approval')
            exit(1)
        else:
            output = open(argv.output, 'w')

    for __ in range(argv.batchsize[0]):

        # mint
        new_ark = ezid.mint(shoulder)
        print(new_ark, file=output)

    if not (argv.mint):
        print('done')
Exemplo n.º 25
0
def main(argv=None):
    """Synchronise top-level Nuxeo objects with EZID ARK records.

    Queries Nuxeo for parent-level documents below a path and, per document,
    mints, creates, or updates its EZID record according to the
    --mint/--create/--update flags.  When no action flag applies, the
    would-be action is reported as a "noop" unless --no-noop-report is set.
    """
    parser = argparse.ArgumentParser(
        description=
        'nxid finds top level objects in Nuxeo and syncs them up with EZID')

    parser.add_argument(
        'path', nargs=1, help='nuxeo path (folder or object)', type=utf8_arg)

    ezid_group = parser.add_argument_group('minting behaviour flags')
    ezid_group.add_argument(
        '--mint', '-m', action='store_true',
        help='when an ARK is missing, mint and bind new ARK in EZID')
    ezid_group.add_argument(
        '--create', '-c', action='store_true',
        help='when an ARK is found in Nuxeo but not EZID, create EZID')
    ezid_group.add_argument(
        '--update', '-u', action='store_true',
        help='when an ARK is found in Nuxeo and EZID, update EZID')
    ezid_group.add_argument(
        '--no-noop-report', action='store_true',
        help='override default behaviour of reporting on noops')
    ezid_group.add_argument(
        '--show-erc', action='store_true',
        help='show ANVL record that will be sent to EZID')

    conf_group = parser.add_argument_group('EZID configuration and metadata')
    conf_group.add_argument(
        '--ezid-username',
        help='username for EZID API (overrides rcfile)', type=utf8_arg)
    conf_group.add_argument(
        '--ezid-password',
        help='password for EZID API (overrides rc file)', type=utf8_arg)
    conf_group.add_argument(
        '--shoulder', help='shoulder (overrides rcfile)', type=utf8_arg)
    conf_group.add_argument(
        '--owner', help='set as _owner for EZID', type=utf8_arg)
    conf_group.add_argument(
        '--status',
        help='set as _status for EZID (public|reserved|unavailable)',
        type=utf8_arg)
    conf_group.add_argument(
        '--publisher', help='set as dc.publisher for EZID', type=utf8_arg)
    conf_group.add_argument(
        '--location', help='set location URL prefix for EZID', type=utf8_arg)

    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())

    # EZID credentials: command-line flags win over the rc file values
    ezid_user = argv.ezid_username or nx.ezid_conf['username']
    ezid_pass = argv.ezid_password or nx.ezid_conf['password']
    ark_shoulder = argv.shoulder or nx.ezid_conf['shoulder']
    ezid_client = EZID.EZIDClient(
        credentials=dict(username=ezid_user, password=ezid_pass))

    # select all parent-level objects under the given folder
    docs = nx.nxql(u'''
SELECT * FROM SampleCustomPicture, CustomFile, CustomVideo, CustomAudio
WHERE ecm:path STARTSWITH "{}"
AND ecm:currentLifeCycleState != "deleted"
AND ecm:pos is NULL'''.format(argv.path[0]))

    # fall back to an exact-path query when the user gave the full path
    # to a single document
    # (emptiness test idiom: https://stackoverflow.com/a/3114640/1763984)
    if not any(True for _ in docs):
        docs = nx.nxql(u'''
SELECT * FROM SampleCustomPicture, CustomFile, CustomVideo, CustomAudio
WHERE ecm:path = "{}"
AND ecm:currentLifeCycleState != "deleted"
AND ecm:pos is NULL'''.format(argv.path[0]))

    noop_report = not argv.no_noop_report

    # main loop: reconcile each Nuxeo document with EZID
    for doc in docs:
        # pull an ARK (if any) out of the Nuxeo identifier field
        ark = find_ark(doc['properties']['ucldc_schema:identifier'])
        doc_path = doc['path']

        # when Nuxeo has an ARK, see whether EZID has a record for it too
        status_record = None
        if ark is not None:
            status_record = check_ezid(ark, ezid_client)

        erc_metadata = item_erc_dict(
            doc,
            owner=argv.owner,  # _owner
            status=argv.status,  # _status
            publisher=argv.publisher,  # dc.publisher
            location=argv.location  # _target
        )

        if argv.show_erc:
            print(EZID.formatAnvlFromDict(erc_metadata))
            print('')

        if not ark and not status_record:
            # no ARK anywhere: mint a fresh one and write it back to Nuxeo
            if argv.mint:
                minted_ark = ezid_client.mint(ark_shoulder, erc_metadata)
                update_nuxeo(doc, nx, minted_ark)
                print('✓ mint "{}" {}'.format(doc_path, minted_ark))
            elif noop_report:
                print('ℹ noop mint "{}"'.format(doc_path))
        elif ark and not status_record:
            # Nuxeo knows the ARK but EZID does not: create the record
            if argv.create:
                ezid_client.create(ark, erc_metadata)
                print('✓ create "{}" {}'.format(doc_path, ark))
            elif noop_report:
                print('ℹ noop create "{}" {}'.format(doc_path, ark))
        elif ark and status_record:
            # both sides know the ARK: refresh the EZID metadata
            ezid_owner = get_owner(status_record)
            if argv.update:
                ezid_client.update(ark, erc_metadata)
                print('✓ update "{}" {}'.format(doc_path, ark))
            elif noop_report:
                print('ℹ noop update "{}" {} {}'.format(
                    doc_path, ark, ezid_owner))
Exemplo n.º 26
0
def main(argv=None):
    """Write a QA spreadsheet (qa.xlsx) listing a Nuxeo document's children.

    Columns: nuxeo uid, local identifier, filename, path (relative to the
    root document), and title.  The root document itself occupies the first
    data row (uid and path only).
    """
    parser = argparse.ArgumentParser(description='nuxeo metadata via REST API')
    parser.add_argument('path', nargs=1, help="nuxeo document path")
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    documents = nx.children(argv.path[0])

    # open the workbook
    workbook = xlsxwriter.Workbook('qa.xlsx')
    header_format = workbook.add_format({
        'bold': True,
    })

    report = workbook.add_worksheet()

    # column widths: uid | localid, filename | path, title
    report.set_column(0, 0, 10)
    report.set_column(1, 2, 40)
    report.set_column(3, 4, 80)

    # header row
    report.write(0, 0, 'nuxeo-uid', header_format)
    report.write(0, 1, 'ucldc_schema:localidentifier', header_format)
    report.write(0, 2, 'filename', header_format)
    report.write(0, 3, 'nuxeo-path', header_format)
    report.write(0, 4, 'title', header_format)

    # first data row: the document specified on the command line
    root_doc = nx.get_metadata(path=argv.path[0])
    report.write(1, 0, root_doc['uid'])
    report.write(1, 3, argv.path[0])

    row = 2
    for document in documents:

        p = document['properties']

        report.write(row, 0, document['uid'])
        # bug fix: guard the local identifier the same way as the filename
        # below; a document with no localidentifier (missing key or empty
        # list) previously raised KeyError/IndexError and aborted the report
        localids = p.get('ucldc_schema:localidentifier')
        if localids:
            report.write(row, 1, localids[0])
        if 'file:filename' in p:
            report.write(row, 2, p['file:filename'])
        # path is written relative to the root document's path
        report.write(row, 3, document['path'].replace(argv.path[0], '', 1))
        report.write(row, 4, document['title'])
        row += 1

    workbook.close()
Exemplo n.º 27
0
def main(argv):
    """Push metadata from a CSV data file into Nuxeo documents.

    Parses --datafile/--rcfile/--loglevel, converts the CSV rows into
    per-document property payloads via Csv2Dict/process_rows, then sends
    each payload to Nuxeo with update_nuxeo_properties.
    (Python 2 syntax: uses print statements.)
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--datafile",
                        type=str,
                        required=True,
                        help="CSV data input file -- required")

    parser.add_argument("--rcfile",
                        type=str,
                        required=True,
                        help="Pynux resource file -- required")

    parser.add_argument("--loglevel",
                        type=str,
                        required=False,
                        default="INFO",
                        help="Set Pynux logging level")

    # NOTE(review): bare except also swallows argparse's own SystemExit
    # (e.g. for --help) and replaces its message -- consider narrowing
    try:
        args = parser.parse_args()
    except:
        print "Unable to parse input parameters..."
        sys.exit(2)

    # validate that both input files exist before doing any work
    # NOTE(review): assert is stripped under `python -O`; a plain
    # `if not os.path.isfile(...)` check would be safer
    try:
        assert os.path.isfile(args.datafile)
    except AssertionError:
        print "Not a file: ", args.datafile
        sys.exit(2)

    try:
        assert os.path.isfile(args.rcfile)
    except AssertionError:
        print "Not a file: ", args.rcfile
        sys.exit(2)

    csv_data_file = args.datafile
    # echo the effective settings
    print csv_data_file
    print args.rcfile
    print args.loglevel

    nx = utils.Nuxeo(rcfile=args.rcfile, loglevel=args.loglevel.upper())
    nuxeo_limit = 24  # NOTE(review): assigned but never used in this function

    # get and instance of the Csv2Dict class which must be initialized
    # with the name of an input data (csv) file

    csv2dict = Csv2Dict(csv_data_file)

    # non-zero status means the constructor could not ingest the file
    if csv2dict.status != 0:
        print 'The Csv2Dict constructor reported and error (%d).' % csv2dict.status
        sys.exit(csv2dict.status)

    process_rows(csv2dict)

    # one payload per document: look up the Nuxeo UID by path, then push
    # the metadata properties for that path
    for n in range(csv2dict.get_meta_dict_length()):
        print "Loading payload %d" % n
        payload = csv2dict.get_meta_dict(n)
        print payload['path']
        uid = nx.get_uid(payload['path'])
        print "Returned UID: %d) %s." % (n, uid)
        print payload
        print payload['path']
        nx.update_nuxeo_properties(payload, path=payload['path'])

    # write a log of everything that was sent
    csv2dict.print_meta_dicts('LOGS/latest_output.txt')
Exemplo n.º 28
0
def main(argv=None):
    """Report (and optionally stash) missing media.json files for a collection.

    Walks the children of a Nuxeo collection and checks the given S3 bucket
    for a corresponding "<uid>-media.json" object.  Missing objects are
    printed; with --stash they are regenerated via NuxeoStashMediaJson.
    """
    parser = argparse.ArgumentParser(
        description='print info for items in collection where media.json '
                    'file is missing.'
    )
    parser.add_argument('path', help="Nuxeo document path for collection")
    parser.add_argument('bucket', help="S3 bucket name")
    parser.add_argument("--pynuxrc", default='~/.pynuxrc',
                        help="rc file for use by pynux")
    parser.add_argument(
        '--stash',
        action="store_true",
        help="create and stash missing media.json file")
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    nuxeo_path = argv.path
    bucketpath = argv.bucket
    pynuxrc = argv.pynuxrc
    stash = argv.stash

    print("collection nuxeo_path:", nuxeo_path)

    # get the Nuxeo ID for the collection
    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())
    nuxeo_id = nx.get_uid(nuxeo_path)
    print("collection nuxeo_id:", nuxeo_id)

    # connect to S3; the bucket name is the first path component
    conn = connect_s3(calling_format=OrdinaryCallingFormat())
    bucketpath = bucketpath.strip("/")
    bucketbase = bucketpath.split("/")[0]
    print("bucketpath:", bucketpath)
    print("bucketbase:", bucketbase)

    try:
        bucket = conn.get_bucket(bucketbase)
    except boto.exception.S3ResponseError:
        # bug fix: previously execution fell through after this print and
        # crashed with a NameError on `bucket`; nothing can be checked
        # without the bucket, so bail out cleanly instead
        print("bucket doesn't exist on S3:", bucketbase)
        return

    items = nx.children(nuxeo_path)

    for item in items:
        obj_key = "{0}-media.json".format(item['uid'])
        s3_url = "s3://{0}/{1}".format(bucketpath, obj_key)
        parts = urlparse.urlsplit(s3_url)

        # Organization documents are containers, not items, so they are
        # not expected to have a media.json object
        if item['type'] != 'Organization' and not bucket.get_key(parts.path):
            print("object doesn't exist on S3:", parts.path, item['path'])
            if stash:
                nxstash = NuxeoStashMediaJson(
                    item['path'],
                    MEDIA_JSON_BUCKET,
                    MEDIA_JSON_REGION,
                    pynuxrc,
                    True)
                nxstash.nxstashref()
                # bug fix: the original printed the literal text
                # "stashed for item['path']" instead of the actual path
                print("stashed for", item['path'])
Exemplo n.º 29
0
def main(argv=None):
    """Build per-campus extent statistics plus a dated summary workbook.

    Optionally (on by default) inventories the S3 bucket up front so the
    per-campus pass can cross-check that every file Nuxeo reports actually
    exists in the bucket; the summary sheet then also gets S3 totals.
    """
    parser = argparse.ArgumentParser(
        description="extent stats via Nuxeo REST API")
    parser.add_argument(
        "outdir",
        nargs=1,
    )
    parser.add_argument("--no-s3-check", dest="s3_check", action="store_false")
    utils.get_common_options(parser)
    if argv is None:
        argv = parser.parse_args()

    os.makedirs(argv.outdir[0], exist_ok=True)

    # memorize every file in S3 (name -> size) so we can double check
    # that all the files exist as we loop through Nuxeo
    file_check = None
    s3_bytes = s3_count = 0
    if argv.s3_check:
        from boto import s3
        from boto.s3.connection import OrdinaryCallingFormat

        file_check = {}
        s3_conn = s3.connect_to_region(
            "us-west-2", calling_format=OrdinaryCallingFormat())
        s3_bucket = s3_conn.get_bucket("data.nuxeo.cdlib.org.oregon")
        for key_index, key in enumerate(s3_bucket.list()):
            file_check[key.name] = key.size
            # progress indicator on stderr for long bucket listings
            if key_index % 50000 == 0:
                print("{0} s3 files memorized".format(key_index),
                      file=sys.stderr)
            s3_bytes += key.size
        s3_count = len(file_check)

    nx = utils.Nuxeo(rcfile=argv.rcfile, loglevel=argv.loglevel.upper())

    campuses = [
        "UCB", "UCD", "UCI", "UCLA", "UCM", "UCOP",
        "UCR", "UCSB", "UCSC", "UCSD", "UCSF",
    ]

    summary_workbook = xlsxwriter.Workbook(
        os.path.join(argv.outdir[0], "{}-summary.xlsx".format(today)))

    # cell formats
    header_format = summary_workbook.add_format({
        "bold": True,
    })
    number_format = summary_workbook.add_format()
    number_format.set_num_format("#,##0")

    summary_worksheet = summary_workbook.add_worksheet("summary")

    # header row
    header_cells = [
        (1, "deduplicated files"),
        (2, "deduplicated bytes"),
        (4, "total files"),
        (5, "total bytes"),
    ]
    if argv.s3_check:
        header_cells += [(7, "files on S3"), (8, "bytes on S3")]
    for col, label in header_cells:
        summary_worksheet.write(0, col, label, header_format)

    # column widths
    for first_col, last_col, width in (
        (0, 1, 10),
        (2, 2, 25),
        (3, 4, 10),
        (5, 5, 25),
        (6, 7, 10),
        (8, 8, 25),
        (9, 9, 10),
    ):
        summary_worksheet.set_column(first_col, last_col, width)

    true_count = dedup_total = total_count = running_total = 0
    for row, campus in enumerate(campuses, start=1):
        (this_count, this_total, dedup_count,
         dedup_bytes) = forCampus(campus, file_check, argv.outdir[0], nx)

        # one summary row per campus
        summary_worksheet.write(row, 0, campus)
        summary_worksheet.write(row, 1, dedup_count, number_format)
        summary_worksheet.write(row, 2, dedup_bytes, number_format)
        summary_worksheet.write(row, 3, sizeof_fmt(dedup_bytes))
        summary_worksheet.write(row, 4, this_count, number_format)
        summary_worksheet.write(row, 5, this_total, number_format)
        summary_worksheet.write(row, 6, sizeof_fmt(this_total))

        # accumulate grand totals
        total_count += this_count  # number of files
        running_total += this_total  # number of bytes
        true_count += dedup_count
        dedup_total += dedup_bytes  # number of bytes

    # totals row beneath the last campus
    row = len(campuses) + 1
    summary_worksheet.write(row, 0, "{}".format(today))
    summary_worksheet.write(row, 1, true_count, number_format)
    summary_worksheet.write(row, 2, dedup_total, number_format)
    summary_worksheet.write(row, 3, sizeof_fmt(dedup_total))
    summary_worksheet.write(row, 4, total_count, number_format)
    summary_worksheet.write(row, 5, running_total, number_format)
    summary_worksheet.write(row, 6, sizeof_fmt(running_total))
    if argv.s3_check:
        summary_worksheet.write(row, 7, s3_count, number_format)
        summary_worksheet.write(row, 8, s3_bytes, number_format)
        summary_worksheet.write(row, 9, sizeof_fmt(s3_bytes))
    summary_workbook.close()