Example #1
    def test_main(self):
        loop_stub = Mock()
        router_stub = Mock()
        radio_stub = Mock()

        gateway.Router.return_value = router_stub
        gateway.Radio.return_value = radio_stub
        gateway.Radio.send_packet = {}
        gateway.asyncio.get_event_loop.return_value = loop_stub
        router_stub.connect_to_message_queue.return_value = 'Future'

        gateway.main()

        gateway.initialize_gpio.assert_called_once_with()
        gateway.Radio.assert_called_once_with()

        gateway.Router.assert_called_once_with()
        loop_stub.run_until_complete.assert_called_once_with('Future')
        router_stub.set_send_packet.assert_called_once_with(
            radio_stub.send_packet)

        gateway.poll.assert_called_once_with(loop_stub, radio_stub,
                                             router_stub)

        loop_stub.run_forever.assert_called_once_with()
        loop_stub.close.assert_called_once_with()
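
The assertions above pin down the call sequence main() has to perform. A minimal sketch of a gateway module that would satisfy this test, with every name inferred from the mocks and asserts (a reconstruction for illustration, not the real project's code):

import asyncio

class Radio:
    def send_packet(self, packet):
        pass  # placeholder: transmit one packet over the radio link

class Router:
    def set_send_packet(self, send_packet):
        self._send_packet = send_packet  # let the router emit packets via the radio

    async def connect_to_message_queue(self):
        pass  # placeholder: the awaitable the test stubs out as 'Future'

def initialize_gpio():
    pass  # placeholder: configure GPIO pins before the radio is used

def poll(loop, radio, router):
    pass  # placeholder: schedule periodic radio polling on the event loop

def main():
    initialize_gpio()
    radio = Radio()
    router = Router()
    router.set_send_packet(radio.send_packet)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(router.connect_to_message_queue())
    poll(loop, radio, router)
    loop.run_forever()
    loop.close()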
Example #2
def testIPSL():
    import gateway   

    #connect to db
    db = ReplicaDB('sqlite:///ipsl2.db')

    #get datasets for replication
    datasets = [
        "cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.day.atmos.cfDay.r1i1p1",
        "cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.day.atmos.day.r1i1p1",
        "cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.fx.atmos.fx.r0i0p0",
        "cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.fx.land.fx.r0i0p0",
        "cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.fx.ocean.fx.r0i0p0",
        "cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.3hr.atmos.3hr.r1i1p1",
        "cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.6hr.atmos.6hrLev.r1i1p1",
        "cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.6hr.atmos.6hrPlev.r1i1p1",
    ]

    for ds in datasets:
        cmd = '-g BADC -of --parent {0}'.format(ds)
        files = gateway.main(cmd.split(' '))
        print len(files)
        print files[0]
        db.add_all(files)
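
gateway.main is called here with -of (file output) and the result goes straight into ReplicaDB.add_all. Judging only from how the harvester examples further down index these records (int(f['size']), f['endpoints'], endpoint['type']), each record behaves like a dict; a purely illustrative sketch of that shape, with invented values:

# Field names inferred from other examples on this page; the values are
# invented, and real records very likely carry additional keys not shown here.
sample_file_record = {
    'size': '1048576',             # later code does int(f['size'])
    'endpoints': [                 # access endpoints, one dict per protocol
        {'type': 'HTTPServer'},
        {'type': 'GridFTP'},
    ],
}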
Example #3
    def setUp(self):
        self.config = testing.setUp()
        import ConfigParser
        from gateway.models import DBSession
        from gateway.models import initialize_sql
        config = ConfigParser.ConfigParser()
        config.readfp(open('testing.ini'))
        db_string = config.get('app:gateway', 'db_string')
        initialize_sql(db_string)
        from gateway import main
        from webtest import TestApp
        app = main(None, **{'db_string': db_string,
                            'mako.directories': config.get('app:gateway', 'mako.directories')})
        self.testapp = TestApp(app)
        self.session = DBSession()
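
setUp expects a testing.ini with an [app:gateway] section providing the two keys it reads. A hypothetical minimal file that would satisfy those config.get calls (both values are placeholders, not the project's real settings):

# testing.ini sketch; section and key names come from the config.get calls above
[app:gateway]
db_string = sqlite:///:memory:
mako.directories = gateway:templates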
Example #4
def updateCollections(gw_name):
    import gateway
    gw_server = gateway.getGatewayInfo()[gw_name]['server']
    
    db = getSession()
    datasets = {}
    for col in db.query(Collection).filter(Collection.state=='published').filter(Collection.gateway==gw_server):
        for dataset in gateway.main(('-g %s --parent %s -do' % (col.gateway, col.id)).split()):
            datasets[dataset['id']] = col.id
    
    #search for orphans of this gateway
    for dataset in db.query(Dataset).filter(Dataset.parent_gateway==gw_server).filter(Dataset.parent_id==None):
        if dataset.id in datasets:
            dataset.parent_id = datasets[dataset.id]
        else:
            #perhaps the top level collection was retracted!
            print "Dataset has no parent", dataset.id
    
    db.commit()
Example #5
def processGateway(gw_name):
    import urllib2, re, xml.dom.minidom, gateway
    db = getSession()
    gw_url = gateway.getGatewayInfo()[gw_name]['url']
    gw_server = gateway.getGatewayInfo()[gw_name]['server']
    
    hlog.debug("jfp gw_url=%s, gw_server=%s",gw_url,gw_server)
    try:
        cmip5_id = _getCMIP5Id(gw_url)
    except:
        if gw_name == 'NCI':
            hlog.warn("_getCMIP5Id failed; but recognize gateway and will use 'cmip5'")
            cmip5_id = 'cmip5'
        else:
            print 'No CMIP5 found for Gateway %s. Check manually.(3)' % gw_name
            return
    hlog.debug("jfp cmip5_id=%s",cmip5_id)
     
    #get all toplevel collections from gateway
    gw_top = {}
    hlog.debug("jfp in processGateway, will call gateway.main with args %s",
               ('-g %s -co' % gw_name).split())
    collections = gateway.main(('-g %s -co' % gw_name).split())
    if collections==None: collections=[]
    if collections==2:   # quick fix; gateway.main should throw an exception instead
        collections=[]
    for tlc in collections:
        gw_top[tlc['id']] = tlc
    
    #get already known collections
    db_col = {}
    for col in db.query(Collection).filter(Collection.gateway==gw_server).all():
        #within the gateway these are unique
        db_col[col.id] = col
    hlog.debug("jfp db_col=%s",db_col)
    
    db_ds = {}
    for ds in getLatestDatasetsQuery(gw_server).filter(Dataset.parent_gateway==gw_server).all():
        db_ds[ds.id] = ds
    
    #now get all CMIP5 datasets
    page_url = '%s/project/%s/dataset/feed.atom' % (gw_url, cmip5_id)
    hlog.debug("jfp in processGateway, about to open %s",page_url)
    # jfp 2012.09.11 This url is unavailable at NCI; the harvest will always fail.
    #    if page_url.find('nci.org.au')>-1:  # jfp unreliable site, try the url twice (doesn't help)
    #        try:
    #            print "first attempt..."
    #            page = urllib2.urlopen(page_url,None,120).read()
    #        except Exception as e:
    #            print "...failed: ",e
    #        print "second attempt..."
    try:
        page = urllib2.urlopen(page_url).read()
    except Exception as e:
        print "exception opening %s in processGateway: %s" % (page_url,e)
        raise e
    dom = xml.dom.minidom.parseString(page)
    counter = 0 #commit after a bunch
    existing_ds = {}
    hlog.debug("jfp %s dom entries",len(dom.getElementsByTagName('entry')))
    for entry in dom.getElementsByTagName('entry'):
        id = entry.getElementsByTagName('title')[0].childNodes[0].data
        timestamp = entry.getElementsByTagName('updated')[0].childNodes[0].data
        last_update = datetime.datetime(*time.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")[:6])
        #top level and normal are mixed!
        if id in gw_top: 
            #this is a top level for cmip5!
            print "Top level found", id 
            if id in db_col:
                #update
                col = db_col[id]
                if last_update > col.modtime:
                    #we know this collection was modified! (not that we care now...)
                    print "Collection modified! was %s now is %s" % (col.modtime, last_update)
                    col.modtime = last_update
            else:
                #add new collection
                metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, id)).split())
                if metadata==None: continue
                db.add(Collection(gateway=gw_server,id=id,modtime=last_update,state=metadata['state']))
            continue

        #remember this dataset for later
        existing_ds[id] = True

        if id in db_ds:
            #we know this normal dataset! Check if it has changed
            if  db_ds[id].modtime == last_update:
                #old news...
                hlog.debug("Unchanged dataset %s, modtime=%s",id,last_update)
                continue
            print "Changed dataset found", id, db_ds[id].modtime, last_update 
            hlog.info( "Changed dataset found %s %s %s", id, db_ds[id].modtime, last_update )
            #something got changed!
            old_ds = db_ds[id]
            old_ds.modtime = last_update
        else:
            print "New dataset found", id, " on ", time.ctime()
            hlog.info( "New dataset found %s on %s" %(id,time.ctime()) )
            old_ds = None
        #new dataset version or something changed!
        metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, id)).split())
        if metadata==None or metadata==2: continue
        hlog.debug("version %s",metadata['version'])
         
        #version work around
        if metadata['state'] == 'retracted':
            print "retracted dataset"
            #this got retracted!
            if old_ds and old_ds.state != metadata['state']:
                #state changed!
                old_ds.state = metadata['state']
            continue
        if not metadata['catalog'] or not metadata['version']:
            print "Can't parse this, no catalog or version!!", metadata
            continue
            
        files = gateway.main(('-g %s --parent %s -fo' % (gw_name, id)).split())
        if files==None: files=[]
        filecount = len(files)
        if filecount > 0:
            size = sum([int(f['size']) for f in files])
            #we assume endpoints are defined per dataset, not per file
            ep = files[filecount/2]['endpoints']
            if ep:
                types = [e['type'] for e in ep]
            else:
                types = []
        else:
            #empty dataset?! There are some...
            size = 0
            types = []

        if old_ds and int(metadata['version']) == old_ds.version:
            print "Same version was updated!!"

            to_check_update = [('access_http', 'HTTPServer' in types),
                ('access_gridftp', 'GridFTP' in types), ('access_opendap', 'OPeNDAP' in types),
                ('filecount', filecount), ('size', size)]
            for var, value in to_check_update:
                report = ""
                old_value = old_ds.__dict__[var]
                if old_value != value:
                    #report and update
                    report += "Changed %s from %s to %s, " % (var, old_value, value)
                    old_ds.__dict__[var] = value
            continue    #Use old_ds instead of creating a new one.
        elif old_ds:
            #new version
            print "New version found %s, last one was %s; on %s" %\
               (metadata['version'], old_ds.version, time.ctime())
            hlog.info( "New version found %s, last one was %s; on %s" %\
               (metadata['version'], old_ds.version, time.ctime()) )
        
        #Definitely a new version of either an existing dataset or a new one.
        try:  #jfp added try/except
            db.add(Dataset(id=id, version=int(metadata['version']), catalog=metadata['catalog'],\
                           state=metadata['state'], filecount=filecount, size=size, access_http=\
                           ('HTTPServer' in types), access_gridftp=('GridFTP' in types),\
                           access_opendap=('OPeNDAP' in types), modtime=last_update, parent_gateway=gw_server))
            db.flush()  # jfp will slow things down, but we'll catch problems right away
        except sqlalchemy.exc.IntegrityError:   #jfp added try/except
            print "exception adding dataset id=",id," version=",metadata['version']," state=",metadata['state']
            print "catalog=",metadata['catalog']," modtime=",last_update," parent_gateway=",gw_server
            print "access_http=",('HTTPServer' in types)," access_gridftp=",('GridFTP' in types)
            print sys.exc_info()[:2]
            db.rollback()  # jfp mandatory after a failed flush!
            # raise  #jfp Now should be able to continue with other datasets.
        if counter > 20:
            #db.commit()
            counter = 0
        else:
            counter += 1
    #db.commit()

    #Now we must find missing ones, so we delete them properly
    for col in db_col.values():
        for dataset in col.datasets:
            if not dataset.id in existing_ds:
                if dataset.state == 'published':
                    dataset.state = 'retracted'
                print "dataset %s was removed by %s" % (dataset.id,time.ctime())
                hlog.info( "dataset %s was removed by %s" %(dataset.id,time.ctime()) )
    #db.commit()
    # print "jfp finished with loop over db_col.values()"
    datasets = {}
    for col in db.query(Collection).filter(Collection.state=='published').filter(Collection.gateway==gw_server):
        gdatasets = gateway.main(('-g %s --parent %s -do' % (col.gateway, col.id)).split())
        if gdatasets==None: gdatasets=[]
        for dataset in gdatasets:
            datasets[dataset['id']] = col.id
    
    for d in db.new:
        if d.id in datasets:
            d.parent_id = datasets[d.id]
        else:
            print "problem with", d
    db.commit()
    print "jfp committed"
Example #6
def processGatewayOld(gw_name, fast=True):
    """Old method for harvesting gateways"""
    import urllib2, re, xml, gateway
    db = getSession()
    gw_url = gateway.getGatewayInfo()[gw_name]['url']
    gw_server = gateway.getGatewayInfo()[gw_name]['server']
    #skip these
    skip_top_level = []

    try:
        cmip5_id = _getCMIP5Id(gw_url)
    except:
        print 'No CMIP5 found for Gateway %s. Check manually.(2)' % gw_name
        hlog.warn( 'No CMIP5 found for Gateway %s. Check manually.(2)' % gw_name )
        return
    
    #get already known collections
    db_col = {}
    for col in db.query(Collection).filter(Collection.gateway==gw_server).all():
        #within the gateway these are unique
        db_col[col.id] = col

    #now get known datasets
    db_ds = {}
    for ds in getLatestDatasetsQuery(gw_server).filter(Dataset.parent_gateway==gw_server).all():
        db_ds['%s#%s' % (ds.id, ds.version)] = ds

    counter = 0
    for col in _getCMIP5Collections(gw_url, cmip5_id):
        hlog.info( "Processing Collection %s on %s" %(col,time.ctime()) )
        if col in skip_top_level:
            print "Skipping"
            hlog.info( "Skipping, time is %s" % (time.ctime()) )
            continue

        if not col in db_col:
            #new collection!
            hlog.info("New collection %s on %s" % (col,time.ctime()))
            md = gateway.main(('-g %s --parent %s -mo' % (gw_name, col)).split())
            if md==None: continue
            #use a fictional date for the update so we know later on which
            #ones should be updated later
            db.add(Collection(id=col, gateway=gw_server,state=md['state'],
                modtime=dummy_date))

        existing_ds = {}
        datasets = gateway.main(('-g %s --parent %s -do' % (gw_name,col)).split())
        if datasets==None: datasets=[]
        for dataset in datasets:
            ds_key = '%s#%s' % (dataset['id'], dataset['version'])

            #store for later
            existing_ds[ds_key] = True

            if ds_key in db_ds:
                old_ds = db_ds[ds_key]
                #should we update? (for now don't...)
                #if int(dataset['version']) == old_ds.version:
                    #same version... we might want to check... but in the common case this won't be necessary
                    #and is extremely expensive for this old way of getting things
                #    continue
            else:
                old_ds = None

            #Avoid reparsing already parsed datasets. They might change! e.g. they can be retracted.
            #They should be parsed once in a while
            if fast and old_ds: continue

            print "Processing dataset", ds_key, " on ", time.ctime()
	    hlog.info( "Processing dataset %s on %s" %(ds_key,time.ctime()) )
            metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, dataset['id'])).split())
            if not metadata:
                continue
            #version work around 
            if metadata['state'] == 'retracted':
                print "retracted dataset"
                hlog.info( "retracted dataset" )
                #this got retracted!
                if old_ds and old_ds.state != metadata['state']:
                    #state changed!
                    old_ds.state = metadata['state']
                continue
            if not metadata['catalog'] or not metadata['version']:
                print "Can't parse this, no catalog or version!!", metadata
                hlog.info( "Can't parse this, no catalog or version!! %s" %s (metadata) )
                continue


            #this is new!
            files = gateway.main(('-g %s --parent %s -fo' % (gw_name,dataset['id'])).split())
            if files==None: files=[]
            filecount = len(files)
            if filecount > 0:
                size = sum([int(f['size']) for f in files])
                #we assume endpoints are defined per dataset, not per file,
                # so use some file in the middle to read them
                ep = files[filecount/2]['endpoints']
                if ep:
                    types = [e['type'] for e in ep]
                else:
                    types = []
            else:
                #empty dataset?! There are some...
                size = 0
                types = []
            if old_ds:
                #we will need to update the existing one
                old_ds.access_http=('HTTPServer' in types)
                old_ds.access_gridftp=('GridFTP' in types)
                old_ds.access_opendap=('OPeNDAP' in types)
                 
            else:
                db.add(Dataset(id=dataset['id'], version=int(metadata['version']), catalog=metadata['catalog'], 
                state=metadata['state'], filecount=filecount, size=size, access_http=('HTTPServer' in types),
                access_gridftp=('GridFTP' in types), access_opendap=('OPeNDAP' in types), 
                modtime=dummy_date, parent_gateway=gw_server, parent_id=col))
            if counter > 20:
                db.commit()
                counter = 0
            else:
                counter += 1
            # db.commit() #jfp temporary extra commit, to aid debugging


        if col in db_col:
            print col, len(db_col[col].datasets), len(existing_ds)
            hlog.info( "collection,lengths: %s, %s, %s on %s" %\
                       ( col, len(db_col[col].datasets), len(existing_ds), time.ctime() ) )
            for dataset in db_col[col].datasets:
                ds_key = '%s#%s' % (dataset.id, dataset.version)
                if not ds_key in existing_ds:
                    print "dataset %s was deleted" % ds_key
                    hlog.info( "dataset %s was deleted" % ds_key )
                    db.delete(dataset)
                    #if dataset.state == 'published':
                        #dataset.state = 'retracted'

    #commit the rest of the changes
    db.commit()
Example #7
def main(argv=None):
    
    if argv is None: argv = sys.argv[1:]
    import getopt
    try:
        args, lastargs = getopt.getopt(argv, "g:D:e:dvqh", ['help', 'gateway-url=', 'parent='])
    except getopt.error:
        print sys.exc_info()[:3]
        return 1

    #init values
    db_name = 'replica.db'
    gatewayURL = gatewayName = regex = None
    parent_set = False
    gw_args = "-o"
    #parse arguments
    for flag, arg in args:
        if flag=='-g':              gw_args = '%s -g %s' % (gw_args, arg)
        elif flag=='--gateway-url': gw_args = '%s --gateway-url %s' % (gw_args, arg)
        elif flag=='--parent': 
            gw_args = '%s --parent %s' % (gw_args, arg)
            parent_set = True
        elif flag=='-D':            db_name = arg
        elif flag=='-e':            
            import re
            regex = re.compile(arg)

#        elif flag=='-x':            retrieve_xml = True
#        elif flag=='-o':            retrieve_object = True

        elif flag=='-d':            log.setLevel(logging.DEBUG)
        elif flag=='-v':            log.setLevel(logging.INFO)
        elif flag=='-q':            log.setLevel(logging.CRITICAL)  # logging has no NONE level
        elif flag=='-h' or flag=='--help': return 1
        
    if not parent_set: 
        gw_args = '-A ' + gw_args
        log.warn('Top level collection not set. Trying all known top-level collections (set with --parent)')
    else: gw_args = '-d ' + gw_args

    
    #Get datasets
    log.info('Retrieving datasets from Gateway')
    log.debug('cmd: %s', gw_args)
    ds = gateway.main(gw_args.split(' '))

    if ds: log.debug('Total datasets: %s', len(ds))
    else: 
        log.error('No Dataset was found!')
        return 0
    
    if regex:
        ds = [d['id'] for d in ds if regex.search(d['id'])]

    #prepare DB
    log.debug('DB: %s', db_name)
    log.debug('Entries: %s', len(ds))
    db = ReplicaDB('sqlite:///%s' % db_name)
    db.open()
    for d in ds:
        cmd = '{0} -of --parent {1}'.format(gw_args[3:], d)
        log.debug('getting files with %s', cmd)
        files = gateway.main(cmd.split(' '))
        if files:
            log.info('Adding %s files for dataset %s', len(files), d)
            db.add_all(files)
        else:
            log.error('no file found!')


    return 0
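
A usage sketch for this entry point, passing argv the same way the other examples drive gateway.main (the dataset id is borrowed from the testIPSL example above; the database name is made up):

# Illustrative call only; all option values are placeholders.
argv = ('-g BADC '
        '--parent cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.day.atmos.day.r1i1p1 '
        '-D ipsl_replica.db -v').split()
main(argv)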
Example #8
def processGateway(gw_name):
    import urllib2, re, xml.dom.minidom, gateway
    db = getSession()
    gw_url = gateway.getGatewayInfo()[gw_name]['url']
    gw_server = gateway.getGatewayInfo()[gw_name]['server']
    
    try:
        cmip5_id = _getCMIP5Id(gw_url)
    except:
        print 'No CMIP5 found for Gateway %s. Check manually.' % gw_name
        return
     
    #get all toplevel collections from gateway
    gw_top = {}
    for tlc in gateway.main(('-g %s -co' % gw_name).split()):
        gw_top[tlc['id']] = tlc
    
    #get already known collections
    db_col = {}
    for col in db.query(Collection).filter(Collection.gateway==gw_server).all():
        #within the gateway these are unique
        db_col[col.id] = col
    
    db_ds = {}
    for ds in getLatestDatasetsQuery(gw_server).filter(Dataset.parent_gateway==gw_server).all():
        db_ds[ds.id] = ds
    
    #now get all CMIP5 datasets
    page = urllib2.urlopen('%s/project/%s/dataset/feed.atom' % (gw_url, cmip5_id)).read()
    dom = xml.dom.minidom.parseString(page)
    counter = 0 #commit after a bunch
    existing_ds = {}
    for entry in dom.getElementsByTagName('entry'):
        id = entry.getElementsByTagName('title')[0].childNodes[0].data
        timestamp = entry.getElementsByTagName('updated')[0].childNodes[0].data
        last_update = datetime.datetime(*time.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")[:6])
        #top level and normal are mixed!
        if id in gw_top: 
            #this is a top level for cmip5!
            print "Top level found", id 
            if id in db_col:
                #update
                col = db_col[id]
                if last_update > col.modtime:
                    #we know this collection was modified! (not that we care now...)
                    print "Collection modified! was %s now is %s" % (col.modtime, last_update)
                    col.modtime = last_update
            else:
                #add new collection
                metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, id)).split())
                db.add(Collection(gateway=gw_server,id=id,modtime=last_update,state=metadata['state']))
            continue

        #remember this dataset for later
        existing_ds[id] = True

        if id in db_ds:
            #we know this normal dataset! Check if it has changed
            if  db_ds[id].modtime == last_update:
                #old news...
                continue
            print "Changed dataset found", id, db_ds[id].modtime, last_update 
            #something got changed!
            old_ds = db_ds[id]
        else:
            print "New dataset found", id
            old_ds = None
        #new dataset version or something changed!
        metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, id)).split())
            
         
        #version work around
        if metadata['state'] == 'retracted':
            print "retracted dataset"
            #this got retracted!
            if old_ds and old_ds.state != metadata['state']:
                #state changed!
                old_ds.state = metadata['state']
            continue
        if not metadata['catalog'] or not metadata['version']:
            print "Can't parse this, no catalog or version!!", metadata
            continue
            
        files = gateway.main(('-g %s --parent %s -fo' % (gw_name, id)).split())
        filecount = len(files)
        if filecount > 0:
            size = sum([int(f['size']) for f in files])
            #we assume endpoints are defined per dataset, not per file
            ep = files[filecount/2]['endpoints']
            if ep:
                types = [e['type'] for e in ep]
            else:
                types = []
        else:
            #empty dataset?! There are some...
            size = 0
            types = []
        if old_ds and int(metadata['version']) == old_ds.version:
            print "Same version was updated!!"

            to_check_update = [('access_http', 'HTTPServer' in types),
                ('access_gridftp', 'GridFTP' in types), ('access_opendap', 'OPeNDAP' in types),
                ('filecount', filecount), ('size', size)]
            for var, value in to_check_update:
                report = ""
                old_value = old_ds.__dict__[var]
                if old_value != value:
                    #report and update
                    report += "Changed %s from %s to %s, " % (var, old_value, value)
                    old_ds.__dict__[var] = value
            continue    #Use old_ds instead of creating a new one.
        elif old_ds:
            #new version
            print "New version found %s, last one was %s" %  (metadata['version'], old_ds.version)
        
        #Definitely a new version of either an existing dataset or a new one.
        db.add(Dataset(id=id, version=int(metadata['version']), catalog=metadata['catalog'], state=metadata['state'],
                filecount=filecount, size=size, access_http=('HTTPServer' in types), access_gridftp=('GridFTP' in types),
                access_opendap=('OPeNDAP' in types), modtime=last_update, parent_gateway=gw_server))
        if counter > 20:
            #db.commit()
            counter = 0
        else:
            counter += 1
    #db.commit()
    
    #Now we must find missing ones, so we delete them properly
    for col in db_col.values():
        for dataset in col.datasets:
            if not dataset.id in existing_ds:
                if dataset.state == 'published':
                    dataset.state = 'retracted'
                print "dataset %s was removed" % dataset.id
    #db.commit()
    datasets = {}
    for col in db.query(Collection).filter(Collection.state=='published').filter(Collection.gateway==gw_server):
        for dataset in gateway.main(('-g %s --parent %s -do' % (col.gateway, col.id)).split()):
            datasets[dataset['id']] = col.id
    
    for d in db.new:
        if d.id in datasets:
            d.parent_id = datasets[d.id]
        else:
            print "problem with", d
    db.commit()