def main(argv=None): parser = argparse.ArgumentParser( description='Print count of objects for a given collection.') parser.add_argument('path', help="Nuxeo path to collection") parser.add_argument( 'since_date', help= "Script will list docs updated since midnight on this date, GMT. Format YYYY-MM-DD", type=valid_date) parser.add_argument('--pynuxrc', default='~/.pynuxrc', help="rcfile for use with pynux utils") parser.add_argument('--components', action='store_true', help="show counts for object components") if argv is None: argv = parser.parse_args() dh = DeepHarvestNuxeo(argv.path, '', pynuxrc=argv.pynuxrc) print "about to fetch docs for path {}".format(dh.path) objects = dh.fetch_objects() component_count = 0 for obj in objects: last_mod_str = obj['lastModified'][:10] last_mod_date = parse(last_mod_str) if last_mod_date > argv.since_date: print last_mod_str, obj['path'] '''
def main(argv=None): parser = argparse.ArgumentParser( description='Print count of objects for a given collection.') parser.add_argument('path', help="Nuxeo path to collection") parser.add_argument('--pynuxrc', default='~/.pynuxrc-prod', help="rcfile for use with pynux utils") parser.add_argument('--components', action='store_true', help="show counts for object components") if argv is None: argv = parser.parse_args() dh = DeepHarvestNuxeo(argv.path, '', pynuxrc=argv.pynuxrc) print "about to fetch objects for path {}".format(dh.path) objects = dh.fetch_objects() object_count = len(objects) print "finished fetching objects. {} found".format(object_count) if not argv.components: return print "about to iterate through objects and get components" component_count = 0 for obj in objects: components = dh.fetch_components(obj) component_count = component_count + len(components) print "finished fetching components. {} found".format(component_count) print "Grand Total: {}".format(object_count + component_count)
def main(argv=None): parser = argparse.ArgumentParser( description='print differences between Nuxeo and CouchDB for a ' 'given collection' ) parser.add_argument('regid', help="Collection Registry ID") parser.add_argument( '--pynuxrc', default='~/.pynuxrc-basic', help="rcfile for use with pynux utils") if argv is None: argv = parser.parse_args() registry_id = argv.regid couch = get_couch_objects(registry_id) print('couch has {} objects'.format(len(couch))) nxpath = s3stash.s3tools.get_nuxeo_path(registry_id) if nxpath is None: print "No record found for registry_id: {}".format(registry_id) sys.exit() dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc) print "about to fetch objects for path {}".format(dh.path) for obj in dh.fetch_objects(): incouch = True if obj['uid'] in couch else False if not incouch: print(obj['uid'])
def deepharvest(self, metadata): ''' given a set of nuxeo metadata for a doc, deep harvest it ''' self.logger.info("Processing {}".format(metadata['uid'])) dh = DeepHarvestNuxeo('') type = dh.get_calisphere_object_type(metadata['type']) self.logger.info("Type: {}".format(type)) report = {} if type == 'image': ''' stash image ''' nxstash = NuxeoStashImage(metadata['path'], IMAGE_BUCKET, IMAGE_REGION, self.pynuxrc, self.replace, metadata=metadata) report[nxstash.uid] = nxstash.nxstashref() print report if type in ['file', 'audio', 'video']: # stash file nxstash = NuxeoStashFile(metadata['path'], FILE_BUCKET, FILE_REGION, self.pynuxrc, self.replace, metadata=metadata) report[nxstash.uid] = nxstash.nxstashref() # stash thumbnail nxstash = NuxeoStashThumb(metadata['path'], THUMB_BUCKET, THUMB_REGION, self.pynuxrc, self.replace, metadata=metadata) report[nxstash.uid] = nxstash.nxstashref() print report # stash media.json '''
def main(argv=None): parser = argparse.ArgumentParser(description='Create ATOM feed for a given Nuxeo folder for Merritt harvesting') parser.add_argument("collection", help="UCLDC Registry Collection ID") parser.add_argument("--pynuxrc", help="rc file for use by pynux") if argv is None: argv = parser.parse_args() collection_id = argv.collection if argv.pynuxrc: ma = MerrittAtom(collection_id, argv.pynuxrc) else: ma = MerrittAtom(collection_id) print "atom_file: {}".format(ma.atom_file) if argv.pynuxrc: dh = DeepHarvestNuxeo(ma.path, '', pynuxrc=argv.pynuxrc) else: dh = DeepHarvestNuxeo(ma.path, '') print "Fetching Nuxeo docs. This could take a while if collection is large..." documents = dh.fetch_objects() # TODO: fetch components also # create root root = etree.Element(etree.QName(ATOM_NS, "feed"), nsmap=NS_MAP) # add entries for document in documents: nxid = document['uid'] print "constructing entry for {} {}".format(nxid, document['path']) nx_metadata = ma._extract_nx_metadata(nxid) entry = etree.Element(etree.QName(ATOM_NS, "entry")) entry = ma._populate_entry(entry, nx_metadata, nxid) root.insert(0, entry) # add header info print "Adding header info to xml tree" ma._add_merritt_id(root, ma.merritt_id) ma._add_paging_info(root) ma._add_collection_alt_link(root, ma.path) ma._add_atom_elements(root) ma._add_feed_updated(root, ma.last_update) ma._publish_feed(root)
def main(argv=None): ''' create and stash media.json files for a nuxeo collection ''' parser = argparse.ArgumentParser(description='Create and stash media.json' 'files for a nuxeo collection') parser.add_argument("path", help="Nuxeo document path") parser.add_argument( "--bucket", default="static.ucldc.cdlib.org/media_json", help="S3 bucket where media.json files will be stashed") parser.add_argument('--region', default='us-east-1', help="aws region") parser.add_argument("--pynuxrc", default='~/.pynuxrc', help="rc file for use by pynux") parser.add_argument('--replace', action="store_true", help="replace file on s3 if it already exists") if argv is None: argv = parser.parse_args() collection = argv.path.split('/')[-1] # logging logfile = 'logs/mediajson-{}.log'.format(collection) print "LOG:\t{}".format(logfile) logging.basicConfig( filename=logfile, level=logging.INFO, format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p') logger = logging.getLogger(__name__) dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc) report = {} objects = dh.fetch_objects() for obj in objects: nxstash = NuxeoStashMediaJson(obj['path'], argv.bucket, argv.region, argv.pynuxrc, argv.replace) report[nxstash.uid] = nxstash.nxstashref() # output report to json file reportfile = "reports/mediajson-{}.json".format(collection) with open(reportfile, 'w') as f: json.dump(report, f, sort_keys=True, indent=4) # parse report to give basic stats report = json.load(open(reportfile)) print "REPORT:\t{}".format(reportfile) print "SUMMARY:" print "processed:\t{}".format(len(report))
def main(argv=None): parser = argparse.ArgumentParser( description='list objects for a given collection.') parser.add_argument('registry_id', help='UCLDC Registry ID') parser.add_argument('--pynuxrc', default='~/.pynuxrc-basic', help="rcfile for use with pynux utils") if argv is None: argv = parser.parse_args() registry_id = argv.registry_id # get nuxeo path nxpath = s3stash.s3tools.get_nuxeo_path(registry_id) if nxpath is None: print "No record found for registry_id: {}".format(registry_id) sys.exit() dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc) print "about to fetch objects for path {}".format(dh.path) objects = dh.fetch_objects() object_count = len(objects) print "finished fetching objects. {} found".format(object_count) print "about to iterate through objects and get components" component_count = 0 all_components = [] for obj in objects: components = dh.fetch_components(obj) all_components.extend(components) print "{} components for {}".format(len(components), obj['uid']) print "finished fetching components. {} found".format(len(all_components)) objects.extend(all_components) total_obj = len(objects) print "Grand Total: {}".format(total_obj) # https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks chunks = [ objects[i:i + PER_PAGE] for i in xrange(0, len(objects), PER_PAGE) ] count = 0 for c in chunks: count = count + 1 filepath = 'chunks/{}_{}.txt'.format(registry_id, count) print "Writing file: {}".format(filepath) with open(filepath, 'w') as f: json.dump(c, f, indent=4)
def __init__(self, path, pynuxrc, replace=False, loglevel=_loglevel_):
    # Set up a deep-harvester for `path` and pre-fetch every object plus
    # its components.
    self.logger = logging.getLogger(__name__)
    self.path = path
    self.pynuxrc = pynuxrc
    self.replace = replace
    self.dh = DeepHarvestNuxeo(self.path, '', pynuxrc=self.pynuxrc)
    # NOTE(review): objects come from self.fetch_objects() (a method on this
    # class) rather than self.dh.fetch_objects() — confirm that is intended.
    self.objects = self.fetch_objects()
    # uid -> list of component docs for that object
    self.components = {
        obj['uid']: self.dh.fetch_components(obj)
        for obj in self.objects
    }
def __init__(self,
             path,
             bucket,
             region,
             pynuxrc='~/.pynuxrc',
             replace=True,
             **kwargs):
    # Extend the base stasher with a deep-harvester, a MediaJson builder,
    # and the uid-derived target file location.
    super(NuxeoStashMediaJson, self).__init__(path, bucket, region, pynuxrc,
                                              replace, **kwargs)

    self.dh = DeepHarvestNuxeo(
        self.path, self.bucket, pynuxrc=self.pynuxrc)
    self.mj = MediaJson()

    self.filename = FILENAME_FORMAT.format(self.uid)
    self.filepath = os.path.join(self.tmp_dir, self.filename)
    # record the computed file location in the run report
    for attr in ('filename', 'filepath'):
        self._update_report(attr, getattr(self, attr))
def main(collection_ids,
         rq_queue='dh-q',
         config=None,
         pynuxrc=None,
         replace=False,
         timeout=JOB_TIMEOUT,
         log_handler=None):
    ''' Queue a deep harvest of a nuxeo object on a worker'''
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()
    log = logbook.Logger('QDH')

    def enqueue(doc_path):
        # one redis-queued deep-harvest job per document path
        queue_deep_harvest_path(
            config['redis_host'],
            config['redis_port'],
            config['redis_password'],
            config['redis_connect_timeout'],
            rq_queue=rq_queue,
            path=doc_path,
            replace=replace,
            timeout=timeout)

    for cid in collection_ids.split(';'):
        url_api = ''.join(
            ('https://registry.cdlib.org/api/v1/collection/', cid, '/'))
        coll = Collection(url_api)
        dh = DeepHarvestNuxeo(coll.harvest_extra_data, '', pynuxrc=pynuxrc)
        for obj in dh.fetch_objects():
            log.info('Queueing TOPLEVEL {} :-: {}'.format(
                obj['uid'], obj['path']))
            # deep harvest top level object
            enqueue(obj['path'])
            # deep harvest component sub-objects
            for c in dh.fetch_components(obj):
                log.info('Queueing {} :-: {}'.format(c['uid'], c['path']))
                enqueue(c['path'])

    log_handler.pop_application()
def __init__(self,
             path,
             bucket,
             region,
             pynuxrc='~/.pynuxrc',
             replace=False,
             **kwargs):
    # Initialize a stash-ref: resolve metadata for `path` (from kwargs if
    # supplied, otherwise via the Nuxeo API), derive the calisphere type,
    # and seed the run report.
    self.logger = logging.getLogger(__name__)

    self.path = path
    self.bucket = bucket
    self.pynuxrc = pynuxrc
    self.region = region
    self.replace = replace

    self.nx = utils.Nuxeo(rcfile=open(expanduser(self.pynuxrc), 'r'))

    # prefer caller-supplied metadata; fall back to a pynux lookup
    if 'metadata' in kwargs:
        self.metadata = kwargs['metadata']
        self.logger.info("got metadata from kwargs")
    else:
        self.metadata = self.nx.get_metadata(path=self.path)
        self.logger.info("got metadata via pynux utils")

    self.uid = self.metadata['uid']
    self.logger.info("initialized NuxeoStashRef with path {}".format(
        self.path.encode('ascii', 'replace')))

    self.dh = DeepHarvestNuxeo(self.path, uid=self.uid)
    self.calisphere_type = self.dh.get_calisphere_object_type(
        self.metadata['type'])

    self.tmp_dir = tempfile.mkdtemp(dir='/tmp')  # FIXME put in conf

    # seed the report with the identifying fields, in a fixed order
    self.report = {}
    for field in ('uid', 'path', 'bucket', 'replace', 'pynuxrc',
                  'calisphere_type'):
        self._update_report(field, getattr(self, field))
def main(argv=None): parser = argparse.ArgumentParser( description= 'Print info on objects missing from couchdb for Nuxeo collection') parser.add_argument('id', help='Collection registry ID') parser.add_argument('--pynuxrc', default='~/.pynuxrc', help="rcfile for use with pynux utils") if argv is None: argv = parser.parse_args() registry_id = argv.id print "Registry ID: {}".format(registry_id) nxpath = s3stash.s3tools.get_nuxeo_path(registry_id) print "Nuxeo path: {}".format(nxpath) # get couchdb data view = "https://harvest-stg.cdlib.org/couchdb/ucldc/_design/all_provider_docs/_view/by_provider_name?key=%22{}%22".format( registry_id) print view res = requests.get(view, verify=False) # FIXME we want to verify res.raise_for_status() couchdata = json.loads(res.content) rows = couchdata['rows'] delimiter = "{}--".format(registry_id) couch_uids = [row['id'].split(delimiter)[1] for row in rows] couch_count = len(couch_uids) print "Total rows in couchdb: {}".format(couch_count) # get nuxeo data dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc) print "about to fetch objects for path {}".format(dh.path) objects = dh.fetch_objects() nx_count = len(objects) print "Total objects in Nuxeo: {}".format(nx_count) for obj in objects: if obj['uid'] not in couch_uids: print obj['uid'], obj['path']
def main(argv=None): parser = argparse.ArgumentParser( description= 'list objects for a given collection where nuxeo doc type is image but file type is pdf' ) parser.add_argument('registry_id', help='UCLDC Registry ID') parser.add_argument('--pynuxrc', default='~/.pynuxrc-basic', help="rcfile for use with pynux utils") if argv is None: argv = parser.parse_args() registry_id = argv.registry_id # get nuxeo path nxpath = s3stash.s3tools.get_nuxeo_path(registry_id) if nxpath is None: print "No record found for registry_id: {}".format(registry_id) sys.exit() dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc) print "about to fetch objects for path {}".format(dh.path) objects = dh.fetch_objects() object_count = len(objects) print "finished fetching objects. {} found".format(object_count) convert = Convert() counter = 0 for obj in objects: if dh.has_file(obj) and obj['type'] == u'SampleCustomPicture' and obj[ 'properties']['file:content'][ 'mime-type'] == u'application/pdf': print obj['uid'], obj['path'], obj['type'], obj['properties'][ 'file:content']['name'] counter = counter + 1 print counter
def __init__(self, url_harvest, extra_data, conf_pynux=None, **kwargs):
    ''' uses pynux (https://github.com/ucldc/pynux) to grab objects from
    the Nuxeo API

    api url is set from url_harvest, overriding pynuxrc config and
    passed in conf. the pynux config file should have user & password
    and X-NXDocumentProperties values filled in.
    '''
    super(NuxeoFetcher, self).__init__(url_harvest, extra_data, **kwargs)
    # FIX: the original default was the mutable `conf_pynux={}`, and this
    # method mutates it (conf_pynux['api'] = ...) — so the shared default
    # dict leaked state across instances. Default to None instead.
    if conf_pynux is None:
        conf_pynux = {}
    self._url = url_harvest
    self._path = extra_data
    self._nx = pynux.utils.Nuxeo(conf=conf_pynux)
    self._nx.conf['api'] = self._url
    self._structmap_bucket = STRUCTMAP_S3_BUCKET

    # get harvestable child objects
    conf_pynux['api'] = self._url
    self._dh = DeepHarvestNuxeo(self._path, '', conf_pynux=conf_pynux)
    self._children = iter(self._dh.fetch_objects())
def main(argv=None): ''' stash Nuxeo files of type 'file', 'audio', or 'video' for a collection ''' parser = argparse.ArgumentParser( description='For Nuxeo collection, stash files (pdf, txt, etc) in S3.') parser.add_argument('path', help="Nuxeo document path to collection") parser.add_argument( '--bucket', default='ucldc-nuxeo-ref-media', help="S3 bucket name") parser.add_argument('--region', default='us-west-2', help="aws region") parser.add_argument( '--pynuxrc', default='~/.pynuxrc', help="rc file for use by pynux") parser.add_argument( '--replace', action="store_true", help="replace file on s3 if it already exists") if argv is None: argv = parser.parse_args() collection = argv.path.split('/')[-1] # logging logfile = 'logs/{}.log'.format(collection) print "LOG:\t{}".format(logfile) logging.basicConfig( filename=logfile, level=logging.INFO, format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p') logger = logging.getLogger(__name__) dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc) report = {} objects = dh.fetch_objects() for obj in objects: nxstash = NuxeoStashFile(obj['path'], argv.bucket, argv.region, argv.pynuxrc, argv.replace) report[nxstash.uid] = nxstash.nxstashref() for c in dh.fetch_components(obj): nxstash = NuxeoStashFile(c['path'], argv.bucket, argv.region, argv.pynuxrc, argv.replace) report[nxstash.uid] = nxstash.nxstashref() # output report to json file reportfile = "reports/{}.json".format(collection) with open(reportfile, 'w') as f: json.dump(report, f, sort_keys=True, indent=4) # parse report to give basic stats report = json.load(open(reportfile)) print "REPORT:\t{}".format(reportfile) print "SUMMARY:" print "processed:\t{}".format(len(report)) not_file = len([ key for key, value in report.iteritems() if not value['calisphere_type'] in VALID_CALISPHERE_TYPES ]) print "not type `file`, `audio` or `video`:\t{}".format(not_file) already_stashed = len([ key for key, value in report.iteritems() 
if 'already_s3_stashed' in value.keys() and value['already_s3_stashed'] ]) print "already stashed:\t{}".format(already_stashed) stashed = len( [key for key, value in report.iteritems() if value['stashed']]) print "(re)stashed:\t{}".format(stashed) print "\nDone."
def main(argv=None): parser = argparse.ArgumentParser( description= 'Create ATOM feed for a given Nuxeo folder for Merritt harvesting') parser.add_argument("collection", help="UCLDC Registry Collection ID") parser.add_argument("--pynuxrc", help="rc file for use by pynux") if argv is None: argv = parser.parse_args() collection_id = argv.collection if argv.pynuxrc: ma = MerrittAtom(collection_id, argv.pynuxrc) else: ma = MerrittAtom(collection_id) print "atom_file: {}".format(ma.atom_file) print "ma.path: {}".format(ma.path) if argv.pynuxrc: dh = DeepHarvestNuxeo(ma.path, '', pynuxrc=argv.pynuxrc) else: dh = DeepHarvestNuxeo(ma.path, '') print "Nuxeo path: {}".format(ma.path) print "Fetching Nuxeo docs. This could take a while if collection is large..." documents = dh.fetch_objects() # create root root = etree.Element(etree.QName(ATOM_NS, "feed"), nsmap=NS_MAP) # add entries for document in documents: nxid = document['uid'] print "working on document: {} {}".format(nxid, document['path']) # parent entry = ma._construct_entry(nxid, True) print "inserting entry for parent object {} {}".format( nxid, document['path']) root.insert(0, entry) # children component_entries = [ ma._construct_entry(c['uid'], False) for c in dh.fetch_components(document) ] for ce in component_entries: print "inserting entry for component: {} {}".format( nxid, document['path']) root.insert(0, ce) # add header info print "Adding header info to xml tree" ma._add_merritt_id(root, ma.merritt_id) ma._add_paging_info(root) ma._add_collection_alt_link(root, ma.path) ma._add_atom_elements(root) ma._add_feed_updated(root, ma.last_update) ma._write_feed(root) print "Feed written to file: {}".format(ma.atom_file) ma._s3_stash() print "Feed stashed on s3: {}".format(ma.s3_url)
def main(argv=None): ''' stash Nuxeo image files on s3 ''' parser = argparse.ArgumentParser( description='For Nuxeo collection, create jp2 versions of image ' 'files and stash in S3.') parser.add_argument('path', help="Nuxeo document path to collection") parser.add_argument('--bucket', default='ucldc-private-files/jp2000', help="S3 bucket name") parser.add_argument('--region', default='us-west-2', help='AWS region') parser.add_argument('--replace', action="store_true", help="replace file on s3 if it already exists") parser.add_argument('--pynuxrc', default='~/.pynuxrc', help="rc file for use by pynux") if argv is None: argv = parser.parse_args() collection = argv.path.split('/')[-1] # logging logfile = 'logs/{}.log'.format(collection) print "LOG:\t{}".format(logfile) logging.basicConfig( filename=logfile, level=logging.INFO, format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p') logger = logging.getLogger(__name__) dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc) report = {} objects = dh.fetch_objects() for obj in objects: nxstash = NuxeoStashImage(obj['path'], argv.bucket, argv.region, argv.pynuxrc, argv.replace) report[nxstash.uid] = nxstash.nxstashref() for c in dh.fetch_components(obj): nxstash = NuxeoStashImage(c['path'], argv.bucket, argv.region, argv.pynuxrc, argv.replace) report[nxstash.uid] = nxstash.nxstashref() # output report to json file reportfile = "reports/{}.json".format(collection) with open(reportfile, 'w') as f: json.dump(report, f, sort_keys=True, indent=4) # parse report to give basic stats report = json.load(open(reportfile)) print "REPORT:\t{}".format(reportfile) print "SUMMARY:" print "processed:\t{}".format(len(report)) not_image = len([ key for key, value in report.iteritems() if not value['is_image']['is_image'] ]) print "not image:\t{}".format(not_image) unrecognized = len([ key for key, value in report.iteritems() if not value['precheck']['pass'] ]) print "not 
convertible:\t{}".format(unrecognized) converted = len( [key for key, value in report.iteritems() if value['converted']]) already_stashed = len([ key for key, value in report.iteritems() if 'already_s3_stashed' in value.keys() and value['already_s3_stashed'] ]) print "converted:\t{}".format(converted) stashed = len( [key for key, value in report.iteritems() if value['stashed']]) print "stashed:\t{}".format(stashed) print "\nDone."