def updateCollections(gw_name): import gateway gw_server = gateway.getGatewayInfo()[gw_name]['server'] db = getSession() datasets = {} for col in db.query(Collection).filter(Collection.state=='published').filter(Collection.gateway==gw_server): for dataset in gateway.main(('-g %s --parent %s -do' % (col.gateway, col.id)).split()): datasets[dataset['id']] = col.id #search for orphans of this gateway for dataset in db.query(Dataset).filter(Dataset.parent_gateway==gw_server).filter(Dataset.parent_id==None): if dataset.id in datasets: dataset.parent_id = datasets[dataset.id] else: #perhaps the top level collection was retracted! print "Dataset has no parent", dataset.id db.commit()
def __update(db):
    """Harvest datasets from the known-active gateways into the local DB.

    Uses gateway.getCurrentDatasets with two callbacks: handle_datasets
    filters out datasets already refreshed within the last 30 days, and
    handle_result converts each remaining dataset into DatasetDAO rows,
    flushing them to the DB in batches via the module-global __to_ingest.
    """
    import gateway
    db.open()
    gateways = gateway.getGatewayInfo()
    # Hard-coded whitelist: only these gateways are harvested.
    known_active = ['WDCC', 'PCMDI', 'BADC', 'NCI', 'NCAR']
    #only reparse datasets once in a while
    import datetime
    # Raw SQL fragment used as a filter; selects rows refreshed in the last 30 days.
    timestamp = "modtime > '{0}'".format(datetime.datetime.now() - datetime.timedelta(days=30))
    global log
    def handle_datasets(datasets, gateway):
        # Filter callback: drop datasets already checked recently, keep the rest.
        #only process if not empty
        if not datasets: return datasets
        total = db._session.query(DatasetDAO).filter(DatasetDAO.parent==datasets[0]['parent']).count()
        if len(datasets) < total:
            #Some datasets are not available anymore, we should scan everything again
            log.error('Dataset were deleted for parent %s, please delete from db.', datasets[0]['parent'])
        # ids of this gateway's datasets whose modtime is within the 30-day window
        known = set([ds[0] for ds in db._session.query(DatasetDAO.dataset_id).filter(DatasetDAO.gateway==gateway['server']).filter(timestamp)])
        new_datasets = []
        #get datasets not already checked within a given period
        for dataset in datasets:
            if dataset['id'] not in known:
                new_datasets.append(dataset)
        log.info('Processing %s new datasets (skipping %s).', len(new_datasets), len(datasets) - len(new_datasets))
        return new_datasets
    global __to_ingest
    __to_ingest = []
    def handle_result(dataset, files, gateway):
        # Result callback: build one DatasetDAO per endpoint of the dataset and
        # queue it in the global batch; flush the batch every 25+ entries.
        global __to_ingest
        #store only MB
        dataset_size = sum([int(f['size']) for f in files])>>20
        files_count= len(files)
        #we assume all files in dataset have the same access points
        for endpoint in files[0]['endpoints']:
            dao = DatasetDAO( dataset, endpoint, gateway, size=dataset_size, files_count=files_count)
            dao.markAsUpdated()
            __to_ingest.append(dao)
        #update if batch's full
        if len(__to_ingest) > 25:
            db.addAll(__to_ingest, overwrite=True)
            __to_ingest = []
    for gw_name in gateways:
        #only parse gateways that are known to be active...
        if gw_name not in known_active: continue
        gw_data = gateways[gw_name]
        log.info('Processing %s', gw_name)
        __to_ingest=[]
        try:
            gateway.getCurrentDatasets(gw_name, filter_datasets=handle_datasets, callback_result=handle_result, continue_on_errors=True)
            #make sure we don't leave any last ones
            db.addAll(__to_ingest, overwrite=True)
        except:
            # Log which gateway failed, then re-raise: the harvest aborts here.
            import sys
            log.error('There was an error contacting gateway %s: %s',gw_name, str(sys.exc_info()[:3]))
            raise
    db._session.commit()
def processGateway(gw_name): import urllib2, re, xml, gateway db = getSession() gw_url = gateway.getGatewayInfo()[gw_name]['url'] gw_server = gateway.getGatewayInfo()[gw_name]['server'] hlog.debug("jfp gw_url=%s, gw_server=%s",gw_url,gw_server) try: cmip5_id = _getCMIP5Id(gw_url) except: if gw_name == 'NCI': hlog.warn("_getCMIP5Id failed; but recognize gateway and will use 'cmip5'") cmip5_id = 'cmip5' else: print 'No CMIP5 found for Gateway %s. Check manually.(3)' % gw_name return hlog.debug("jfp cmip5_id=%s",cmip5_id) #get all toplevel collections from gateway gw_top = {} hlog.debug("jfp in processGateway, will call gateway.main with args %s", ('-g %s -co' % gw_name).split()) collections = gateway.main(('-g %s -co' % gw_name).split()) if collections==None: collections=[] if collections==2: # quick fix; gateway.main should throw an exception instead collections=[] for tlc in collections: gw_top[tlc['id']] = tlc #get already known collections db_col = {} for col in db.query(Collection).filter(Collection.gateway==gw_server).all(): #within the gateway these are unique db_col[col.id] = col hlog.debug("jfp db_col=%s",db_col) db_ds = {} for ds in getLatestDatasetsQuery(gw_server).filter(Dataset.parent_gateway==gw_server).all(): db_ds[ds.id] = ds #now get all CMIP5 datasets page_url = '%s/project/%s/dataset/feed.atom' % (gw_url, cmip5_id) hlog.debug("jfp in processGateway, about to open %s",page_url) # jfp 2012.09.11 This url is unavailable at NCI; the harvest will always fail. # if page_url.find('nci.org.au')>-1: # jfp unreliable site, try the url twice (doesn't help) # try: # print "first attempt..." # page = urllib2.urlopen(page_url,None,120).read() # except Exception as e: # print "...failed: ",e # print "second attempt..." 
try: page = urllib2.urlopen(page_url).read() except Exception as e: print "exception opening %s in processGateway: %s" % (page_url,e) raise e dom = xml.dom.minidom.parseString(page) counter = 0 #commit after a bunch existing_ds = {} hlog.debug("jfp %s dom entries",len(dom.getElementsByTagName('entry'))) for entry in dom.getElementsByTagName('entry'): id = entry.getElementsByTagName('title')[0].childNodes[0].data timestamp = entry.getElementsByTagName('updated')[0].childNodes[0].data last_update = datetime.datetime(*time.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")[:6]) #top level and normal are mixed! if id in gw_top: #this is a top level for cmip5! print "Top level found", id if id in db_col: #update col = db_col[id] if last_update > col.modtime: #we know this collection was modified! (not that we car now...) print "Collection modifed! was %s now is %s" % (col.modtime, last_update) col.modtime = last_update else: #add new collection metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, id)).split()) if metatdata==None: continue db.add(Collection(gateway=gw_server,id=id,modtime=last_update,state=metadata['state'])) continue #remember this dataset for later existing_ds[id] = True if id in db_ds: #we know this normal dataset! Check if it has changed if db_ds[id].modtime == last_update: #old news... hlog.debug("Unchanged dataset %s, modtime=%s",id,last_update) continue print "Changed dataset found", id, db_ds[id].modtime, last_update hlog.info( "Changed dataset found %s %s %s", id, db_ds[id].modtime, last_update ) #something got changed! old_ds = db_ds[id] old_ds.modtime = last_update else: print "New dataset found", id, " on ", time.ctime() hlog.info( "New dataset found %s on %s" %(id,time.ctime()) ) old_ds = None #new dataset version or something changed! 
metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, id)).split()) if metadata==None or metadata==2: continue hlog.debug("version %s",metadata['version']) #version work around if metadata['state'] == 'retracted': print "retracted dataset" #this got retracted! if old_ds and old_ds.state != metadata['state']: #state changed! old_ds.state = metadata['state'] continue if not metadata['catalog'] or not metadata['version']: print "Can't parse this, no catalog or version!!", metadata continue files = gateway.main(('-g %s --parent %s -fo' % (gw_name, id)).split()) if files==None: files=[] filecount = len(files) if filecount > 0: size = sum([int(f['size']) for f in files]) #we assume this is per dataset defined, and not per file ep = files[filecount/2]['endpoints'] if ep: types = [e['type'] for e in ep] else: types = [] else: #empty dataset?! There are some... size = 0 types = [] if old_ds and int(metadata['version']) == old_ds.version: print "Same version was updated!!" to_check_update = [('access_http', 'HTTPServer' in types), ('access_gridftp', 'GridFTP' in types), ('access_opendap', 'OPeNDAP'in types), ('filecount', filecount), ('size', size)] for var, value in to_check_update: report = "" old_value = old_ds.__dict__[var] if old_value != value: #report and update report += "Changed %s from %s to %s, " % (var, old_value, value) old_ds.__dict__[var] = value continue #Use old_ds instead of creating a new one. elif old_ds: #new version print "New version found %s, last one was %s; on %s" %\ (metadata['version'], old_ds.version, time.ctime()) hlog.info( "New version found %s, last one was %s; on %s" %\ (metadata['version'], old_ds.version, time.ctime()) ) #Definitely a new version of either an existing dataset or a new one. 
try: #jfp added try/except db.add(Dataset(id=id, version=int(metadata['version']), catalog=metadata['catalog'],\ state=metadata['state'], filecount=filecount, size=size, access_http=\ ('HTTPServer' in types), access_gridftp=('GridFTP' in types),\ access_opendap=('OPeNDAP' in types), modtime=last_update, parent_gateway=gw_server)) db.flush() # jfp will slow things down, but we'll catch problems right away except sqlalchemy.exc.IntegrityError: #jfp added try/except print "exception adding dataset id=",id," version=",metadata['version']," state=",metadata['state'] print "catalog=",metadata['catalog']," modtime=",last_update," parent_gateway=",gw_server print "access_http=",('HTTPServer' in types)," access_gridftp=",('GridFTP' in types) print sys.exc_info()[:2] db.rollback() # jfp mandatory after a failed flush! # raise #jfp Now should be able to continue with other datasets. if counter > 20: #db.commit() counter = 0 else: counter += 1 #db.commit() #Now we must find missing ones, so we delete them properly for col in db_col.values(): for dataset in col.datasets: if not dataset.id in existing_ds: if dataset.state == 'published': dataset.state = 'retracted' print "dataset %s was removed by %s" % (dataset.id,time.ctime()) hlog.info( "dataset %s was removed by %s" %(dataset.id,time.ctime()) ) #db.commit() # print "jfp finished with loop over db_col.values()" datasets = {} for col in db.query(Collection).filter(Collection.state=='published').filter(Collection.gateway==gw_server): gdatasets = gateway.main(('-g %s --parent %s -do' % (col.gateway, col.id)).split()) if gdatasets==None: gdatasets=[] for dataset in gdatasets: datasets[dataset['id']] = col.id for d in db.new: if d.id in datasets: d.parent_id = datasets[d.id] else: print "problem with", d db.commit() print "jfp committed"
def processGatewayOld(gw_name, fast=True): """Old method for harvesting gateways""" import urllib2, re, xml, gateway db = getSession() gw_url = gateway.getGatewayInfo()[gw_name]['url'] gw_server = gateway.getGatewayInfo()[gw_name]['server'] #skip these skip_top_level = [] try: cmip5_id = _getCMIP5Id(gw_url) except: print 'No CMIP5 found for Gateway %s. Check manually.(2)' % gw_name hlog.warn( 'No CMIP5 found for Gateway %s. Check manually.(2)' % gw_name ) return #get already known collections db_col = {} for col in db.query(Collection).filter(Collection.gateway==gw_server).all(): #within the gateway these are unique db_col[col.id] = col #now get known datasets db_ds = {} for ds in getLatestDatasetsQuery(gw_server).filter(Dataset.parent_gateway==gw_server).all(): db_ds['%s#%s' % (ds.id, ds.version)] = ds counter = 0 for col in _getCMIP5Collections(gw_url, cmip5_id): hlog.info( "Processing Collection %s on %s" %(col,time.ctime()) ) if col in skip_top_level: print "Skipping" hlog.info( "Skipping, time is %s" % (time.ctime()) ) continue if not col in db_col: #new collection! hlog.info("New collection %s on %s" % (col,time.ctime())) md = gateway.main(('-g %s --parent %s -mo' % (gw_name, col)).split()) if md==None: continue #use a fictional date for the update so we know later on which #should be latered db.add(Collection(id=col, gateway=gw_server,state=md['state'], modtime=dummy_date)) existing_ds = {} datasets = gateway.main(('-g %s --parent %s -do' % (gw_name,col)).split()) if datasets==None: datasets=[] for dataset in datasets: ds_key = '%s#%s' % (dataset['id'], dataset['version']) #store for later existing_ds[ds_key] = True if ds_key in db_ds: old_ds = db_ds[ds_key] #should we update? (for now don't...) #if int(dataset['version']) == old_ds.version: #same version... we might want to check... 
but in the common case this won't be necessary #and is extremely expensive for this old way of getting things # continue else: old_ds = None #Avoid reparsing already parsed datasets. The might change! e.g. they can be retracted. #They should be parsed once in a while if fast and old_ds: continue print "Processing dataset", ds_key, " on ", time.ctime() hlog.info( "Processing dataset %s on %s" %(ds_key,time.ctime()) ) metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, dataset['id'])).split()) if not metadata: continue #version work around if metadata['state'] == 'retracted': print "retracted dataset" hlog.info( "retracted dataset" ) #this got retracted! if old_ds and old_ds.state != metadata['state']: #state changed! old_ds.state = metadata['state'] continue if not metadata['catalog'] or not metadata['version']: print "Can't parse this, no catalog or version!!", metadata hlog.info( "Can't parse this, no catalog or version!! %s" %s (metadata) ) continue #this is new! files = gateway.main(('-g %s --parent %s -fo' % (gw_name,dataset['id'])).split()) if files==None: files=[] filecount = len(files) if filecount > 0: size = sum([int(f['size']) for f in files]) #we assume this is per dataset defined, and not per file # use some file in the middle for this ep = files[filecount/2]['endpoints'] if ep: types = [e['type'] for e in ep] else: types = [] else: #empty dataset?! There are some... 
size = 0 types = [] if old_ds: #we will need to update the existing one old_ds.access_http=('HTTPServer' in types) old_ds.access_gridftp=('GridFTP' in types) old_ds.access_opendap=('OPeNDAP'in types) else: db.add(Dataset(id=dataset['id'], version=int(metadata['version']), catalog=metadata['catalog'], state=metadata['state'], filecount=filecount, size=size, access_http=('HTTPServer' in types), access_gridftp=('GridFTP' in types), access_opendap=('OPeNDAP' in types), modtime=dummy_date, parent_gateway=gw_server, parent_id=col)) if counter > 20: db.commit() counter = 0 else: counter += 1 # db.commit() #jfp temporary extra commit, to aid debugging if col in db_col: print col, len(db_col[col].datasets), len(existing_ds) hlog.info( "collection,lengths: %s, %s, %s on %s" %\ ( col, len(db_col[col].datasets), len(existing_ds), time.ctime() ) ) for dataset in db_col[col].datasets: ds_key = '%s#%s' % (dataset.id, dataset.version) if not ds_key in existing_ds: print "dataset %s was deleted" % ds_key hlog.info( "dataset %s was deleted" % ds_key ) db.delete(dataset) #if dataset.state == 'published': #dataset.state = 'retracted' #commit the rest of the changes db.commit()
def processGateway(gw_name): import urllib2, re, xml, gateway db = getSession() gw_url = gateway.getGatewayInfo()[gw_name]['url'] gw_server = gateway.getGatewayInfo()[gw_name]['server'] try: cmip5_id = _getCMIP5Id(gw_url) except: print 'No CMIP5 found for Gateway %s. Check manually.' % gw_name return #get all toplevel collections from gateway gw_top = {} for tlc in gateway.main(('-g %s -co' % gw_name).split()): gw_top[tlc['id']] = tlc #get already known collections db_col = {} for col in db.query(Collection).filter(Collection.gateway==gw_server).all(): #within the gateway these are unique db_col[col.id] = col db_ds = {} for ds in getLatestDatasetsQuery(gw_server).filter(Dataset.parent_gateway==gw_server).all(): db_ds[ds.id] = ds #now get all CMIP5 datasets page = urllib2.urlopen('%s/project/%s/dataset/feed.atom' % (gw_url, cmip5_id)).read() dom = xml.dom.minidom.parseString(page) counter = 0 #commit after a bunch existing_ds = {} for entry in dom.getElementsByTagName('entry'): id = entry.getElementsByTagName('title')[0].childNodes[0].data timestamp = entry.getElementsByTagName('updated')[0].childNodes[0].data last_update = datetime.datetime(*time.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")[:6]) #top level and normal are mixed! if id in gw_top: #this is a top level for cmip5! print "Top level found", id if id in db_col: #update col = db_col[id] if last_update > col.modtime: #we know this collection was modified! (not that we car now...) print "Collection modifed! was %s now is %s" % (col.modtime, last_update) col.modtime = last_update else: #add new collection metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, id)).split()) db.add(Collection(gateway=gw_server,id=id,modtime=last_update,state=metadata['state'])) continue #remember this dataset for later existing_ds[id] = True if id in db_ds: #we know this normal dataset! Check if it has changed if db_ds[id].modtime == last_update: #old news... 
continue print "Changed dataset found", id, db_ds[id].modtime, last_update #something got changed! old_ds = db_ds[id] else: print "New dataset found", id old_ds = None #new dataset version or something changed! metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, id)).split()) #version work around if metadata['state'] == 'retracted': print "retracted dataset" #this got retracted! if old_ds and old_ds.state != metadata['state']: #state changed! old_ds.state = metadata['state'] continue if not metadata['catalog'] or not metadata['version']: print "Can't parse this, no catalog or version!!", metadata continue files = gateway.main(('-g %s --parent %s -fo' % (gw_name, id)).split()) filecount = len(files) if filecount > 0: size = sum([int(f['size']) for f in files]) #we assume this is per dataset defined, and not per file ep = files[filecount/2]['endpoints'] if ep: types = [e['type'] for e in ep] else: types = [] else: #empty dataset?! There are some... size = 0 types = [] if old_ds and int(metadata['version']) == old_ds.version: print "Same version was updated!!" to_check_update = [('access_http', 'HTTPServer' in types), ('access_gridftp', 'GridFTP' in types), ('access_opendap', 'OPeNDAP'in types), ('filecount', filecount), ('size', size)] for var, value in to_check_update: report = "" old_value = old_ds.__dict__[var] if old_value != value: #report and update report += "Changed %s from %s to %s, " % (var, old_value, value) old_ds.__dict__[var] = value continue #Use old_ds instead of creating a new one. elif old_ds: #new version print "New version found %s, last one was %s" % (metadata['version'], old_ds.version) #Definitely a new version of either an existing dataset or a new one. 
db.add(Dataset(id=id, version=int(metadata['version']), catalog=metadata['catalog'], state=metadata['state'], filecount=filecount, size=size, access_http=('HTTPServer' in types), access_gridftp=('GridFTP' in types), access_opendap=('OPeNDAP' in types), modtime=last_update, parent_gateway=gw_server)) if counter > 20: #db.commit() counter = 0 else: counter += 1 #db.commit() #Now we must find missing ones, so we delete them properly for col in db_col.values(): for dataset in col.datasets: if not dataset.id in existing_ds: if dataset.state == 'published': dataset.state = 'retracted' print "dataset %s was removed" % dataset.id #db.commit() datasets = {} for col in db.query(Collection).filter(Collection.state=='published').filter(Collection.gateway==gw_server): for dataset in gateway.main(('-g %s --parent %s -do' % (col.gateway, col.id)).split()): datasets[dataset['id']] = col.id for d in db.new: if d.id in datasets: d.parent_id = datasets[d.id] else: print "problem with", d db.commit()