def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    # Generator to yield records from baseUrl in the given metadataPrefix
    # Add metadataPrefix to args
    kwargs['metadataPrefix'] = metadataPrefix
    client = Client(baseUrl, self._mdRegistry)
    # Check that baseUrl actually represents an OAI-PMH target
    try:
        client.identify()
    except IndexError:
        raise NotOAIPMHBaseURLException(
            "{0} does not appear to be an OAI-PMH compatible base URL"
            "".format(baseUrl))
    # Check server timestamp granularity support
    client.updateGranularity()
    for record in client.listRecords(**kwargs):
        yield record
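# Added usage sketch (not from the original source): the same identify /
# updateGranularity / listRecords pattern as above, written as a standalone
# script. The base URL below is only an example endpoint; any OAI-PMH
# repository that serves 'oai_dc' records would do.
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client('http://export.arxiv.org/oai2', registry)

client.identify()           # raises if the URL is not an OAI-PMH endpoint
client.updateGranularity()  # honour the server's datestamp granularity

for header, metadata, about in client.listRecords(metadataPrefix='oai_dc'):
    print(header.identifier())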
def test(request):
    URL = 'http://www.kulturarv.dk/ffrepox/OAIHandler'
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(URL, registry)
    identifyResponse = client.identify()
    print dir(identifyResponse)
    #for record in client.listRecords(metadataPrefix='oai_dc'):
    #    result += record
    return HttpResponse(identifyResponse.repositoryName())
def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    # Generator to yield records from baseUrl in the given metadataPrefix
    # Add metadataPrefix to args
    kwargs['metadataPrefix'] = metadataPrefix
    client = Client(baseUrl, self._mdRegistry)
    # Check that baseUrl actually represents an OAI-PMH target
    try:
        client.identify()
    except IndexError:
        raise NotOAIPMHBaseURLException(
            "{0} does not appear to be an OAI-PMH compatible base URL"
            "".format(baseUrl))
    # Check server timestamp granularity support
    client.updateGranularity()
    for record in client.listRecords(**kwargs):
        # Unit test hotfix
        header, metadata, about = record
        # Fix pyoai returning a "b'...'" string for py3k
        if isinstance(metadata, str) and metadata.startswith("b'"):
            metadata = ast.literal_eval(metadata).decode("utf-8")
        yield (header, metadata, about)
def clean(self):
    cleaned_data = super(CreateRepositoryForm, self).clean()
    try:
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(cleaned_data.get('base_url'), registry)
        server = client.identify()
        # Set the repository name, applied to the model instance when saved.
        cleaned_data['name'] = server.repositoryName()
    except Exception:
        raise ValidationError('Repository base url is invalid.')
    return cleaned_data
def checkProvider(self, url):
    """Check an OAI-PMH provider.

    A valid Identify response is considered as the provider being online;
    an exception is considered as the provider being offline.
    """
    try:
        client = Client(url)
        ident = client.identify()
        self.log.debug("Service at: " + url + " is responding")
        self.log.debug("RepositoryName is: " + ident.repositoryName())
        self.log.debug("BaseURL is: " + ident.baseURL())
        return True
    except Exception as e:
        self.log.error("Problem with server at: " + url + "\n")  # , exc_info=True)
        return False
        'oai_oi': 'http://www.openbeelden.nl/feeds/oai/',  # 'http://www.openarchives.org/OAI/2.0/oai_oi/',
        'oi': 'http://www.openbeelden.nl/oai/'}
)

#URL = 'http://www.openbeelden.nl/oip-test/feeds/oai/'
URL = 'http://www.openbeelden.nl/feeds/oai/'
#URL = 'http://oai.tuxic.nl/oai/'

# Initialise the OAI client
registry = MetadataRegistry()
registry.registerReader('oai_oi', oai_oi_reader)
client = Client(URL, registry)
x = client.updateGranularity()

# Check that the OAI service identifies itself correctly
x = client.identify()
print 'identity %s' % x.repositoryName()
print 'identity %s' % x.protocolVersion()
print 'identity %s' % x.baseURL()

OUTPUT_DIR = '/Users/jblom/temp'

print 'Firing up the openSKOSHandler'
osh = OpenSKOSHandler()


def processOpenbeelden():
    i = 0
    iarecs = []
    #for y in client.listRecords(metadataPrefix='oai_oi', from_=parse('2011-01-01'), until=parse('2011-11-01')):
    extent = None
URL = "http://citeseerx.ist.psu.edu/oai2" registry = MetadataRegistry() registry.registerReader('oai_dc', oai_dc_reader) client = Client(URL, registry) client.updateGranularity() store = Store() if len(sys.argv) > 1: start = datetime.strptime(sys.argv[1], '%Y-%m-%d') #2011-10-27, for instance elif store.last(): start = store.last() else: start = client.identify().earliestDatestamp() #try this and see if it works; if it does resumption tokens right, this should work fine. chunk = timedelta(days=1) oneday = timedelta(days=1) #TODO: clearly they don't do this whole "ordered" thing. Grab records by month or year or something instead of all at once. #TODO: luckily, once we've done a full slurp, we only need to remember when the last full slurp was and start since then. But if interrupted, we need to start back from where the last *full* slurp was, due to the ordering problem. #TODO: structure this better, with the try effectively moved much further above. Really, move a lot more into functions try: current = start #TODO: make a nice little generator so I can use a for loop while current <= datetime.now(): print >>sys.stderr, "fetching records @", now(), "starting with", current.strftime('%Y-%m-%d')
def run(self):
    # Check that ElasticSearch is alive
    self.check_index()

    # If the user specified the --REBUILD flag, recreate the index
    if self.options['rebuild']:
        self.rebuild_index()

    # Connect to the repository
    registry = MetadataRegistry()
    registry.registerReader(self.settings["metadata_format"],
                            self.settings["metadata_reader"])
    client = Client(self.settings["uri"], registry)

    identity = client.identify()
    print "Connected to repository: %s" % identity.repositoryName()

    # got to update granularity or we barf with:
    # oaipmh.error.BadArgumentError: Max granularity is YYYY-MM-DD:2003-04-10T00:00:00Z
    client.updateGranularity()

    # Initialise some variables
    batcher = Batch.Batch()
    total_records = 0
    start = time.time()

    # Now do the synchronisation
    # If the user specified an identifier, then synchronise this record
    if (self.options['identifier'] is not None):
        total_records += self.synchronise_record(client, batcher,
                                                 self.options['identifier'])
    else:
        # Else, synchronise using the date-range provided by the user, or
        # failing that, the date-range based on the last sync

        # Get the synchronisation config record
        synchronisation_config = self.get_synchronisation_config()

        if self.options["from_date"] is not None:
            # If the user specified a from-date argument, use it
            from_date = self.options["from_date"]  # already a date (not a datetime)
        elif synchronisation_config is not None and "to_date" in synchronisation_config:
            # Else read the last synchronised to_date from the config, and add on a day
            from_date = dateutil.parser.parse(
                synchronisation_config["to_date"]).date() + timedelta(days=1)
        else:
            # Else use the default_from_date in the config
            from_date = dateutil.parser.parse(
                self.settings['default_from_date']).date()

        if self.options["to_date"] is not None:
            to_date = self.options["to_date"]  # already a date (not a datetime)
        else:
            to_date = (date.today() - timedelta(days=1))

        # Force the from_date to use time 00:00:00
        from_date = datetime.combine(
            from_date, _time(hour=0, minute=0, second=0, microsecond=0))

        # Force the to_date to use time 23:59:59
        to_date = datetime.combine(
            to_date, _time(hour=23, minute=59, second=59, microsecond=0))

        print "Synchronising from %s - %s" % (from_date, to_date)

        while from_date < to_date:
            next_date = datetime.combine(
                from_date.date() + timedelta(days=(self.settings['delta_days'] - 1)),
                _time(hour=23, minute=59, second=59, microsecond=0))
            number_of_records = self.synchronise_period(client, batcher,
                                                        from_date, next_date)
            batcher.clear()  # Store the records in elasticsearch
            self.put_synchronisation_config(from_date, next_date,
                                            number_of_records)
            from_date += timedelta(days=(self.settings['delta_days']))
            total_records += number_of_records

            # Pause so as not to get banned.
            to = 20
            print "Sleeping for %i seconds so as not to get banned." % to
            time.sleep(to)

    # Store the records in the index
    batcher.clear()

    # Print out some statistics
    time_spent = time.time() - start
    print 'Total time spent: %d seconds' % (time_spent)

    if time_spent > 0.001:  # careful as it's not an integer
        print 'Total records synchronised: %i records (%d records/second)' % (
            total_records, (total_records / time_spent))
    else:
        print 'Total records synchronised: %i records' % (total_records)

    return total_records
    sys.exit()
def add_provider(cxn, args):
    """Add a new provider to the registry database.

    Process ``args`` to add a new provider to the registry database. Return 0
    for success, 1 for failure (error message should be logged).

    ``cxn`` => instance of ``sqlite3.Connection``
    ``args`` => instance of ``argparse.Namespace``
    """
    global logger, MAX_NAME_LENGTH
    addlogger = logger.getChild('add')
    # Validate name
    if len(args.name) > MAX_NAME_LENGTH:
        addlogger.critical('Short name for new provider must be no more than '
                           '{0} characters long'.format(MAX_NAME_LENGTH))
        return 1
    elif args.name.startswith(('http://', 'https://')) or args.name == 'all':
        addlogger.critical('Short name for new provider may not be "all" nor '
                           'may it begin "http://" or "https://"')
        return 1
    # Try to create row now to avoid unnecessary validation if duplicate
    try:
        cxn.execute("INSERT INTO providers(name, lastHarvest) values "
                    "(?, ?)",
                    (args.name, datetime.fromtimestamp(0)))
    except sqlite3.IntegrityError:
        addlogger.critical('Unable to add provider "{0}"; '
                           'provider with this name already exists'
                           ''.format(args.name))
        return 1
    else:
        addlogger.info('Adding provider "{0}"'.format(args.name))
    # Get any missing information
    # Base URL
    if args.url is None:
        args.url = raw_input('Base URL:'.ljust(20))
    if not args.url:
        addlogger.critical('Base URL for new provider not supplied')
        return 1
    # Set up an OAI-PMH client for validating providers
    md_registry = MetadataRegistry()
    md_registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(args.url, md_registry)
    # Validate Base URL by fetching Identify
    try:
        client.identify()
    except (XMLSyntaxError, HTTPError):
        addlogger.critical('Base URL for new provider does not return a valid '
                           'response to an `Identify` request')
        return 1
    # Destination
    if args.dest is None:
        args.dest = raw_input('Destination directory: '.ljust(20))
    if args.dest:
        # Expand user dir
        args.dest = os.path.expanduser(args.dest)
    else:
        addlogger.info('Destination for data for new provider not supplied'
                       ' using default `pwd`: {0}'.format(os.getcwd()))
        args.dest = os.getcwd()
    # metadataPrefix
    # Check that selected metadataPrefix is available from provider
    # Fetch list of available formats
    mdps = dict((mdpinfo[0], mdpinfo[1:])
                for mdpinfo in client.listMetadataFormats())
    while args.metadataPrefix not in mdps:
        print "Available metadataPrefix values:"
        # List available formats
        for mdp in mdps:
            print mdp, '-', mdps[mdp][1]
        args.metadataPrefix = raw_input('metadataPrefix [oai_dc]:'.ljust(20))
        if not args.metadataPrefix:
            addlogger.info('metadataPrefix for new provider not supplied. '
                           'using default: oai_dc')
            args.metadataPrefix = 'oai_dc'
    cxn.execute("UPDATE providers SET "
                "url=?, "
                "destination=?, "
                "metadataPrefix=? "
                "WHERE name=?",
                (args.url,
                 args.dest,
                 args.metadataPrefix,
                 args.name))
    addlogger.info('URL for next harvest: {0}?verb=ListRecords'
                   '&metadataPrefix={1}'
                   '&from={2:%Y-%m-%dT%H:%M:%SZ%z}'
                   ''.format(args.url,
                             args.metadataPrefix,
                             datetime.fromtimestamp(0)))
    # All done, commit database
    cxn.commit()
    return 0
class Repository(object):
    """
    Repository handles interaction with the various interfaces provided by
    the dspace repository.
    """

    def __init__(self, url=None, **kwargs):
        self.base_url = kwargs.pop('base_url', None)
        self.oai_path = kwargs.pop('oai_path', None)

        self.oai_enabled = bool(kwargs.pop('oai_enabled', True))
        self.sword_enabled = bool(kwargs.pop('sword_enabled', False))

        if url is not None:
            warn(
                'The url parameter will not be supported in version 3, '
                'use base_url and oai_path instead',
                DeprecationWarning)
            if (self.base_url and url.startswith(self.base_url)
                    and self.oai_path is None):
                self.oai_path = url.replace(self.base_url, '', 1).lstrip('/')
            elif not self.base_url:
                if self.oai_path is None:
                    self.oai_path = 'dspace-oai/request'
                if url.endswith(self.oai_path):
                    self.base_url = url[:-(len(self.oai_path) + 1)]

        if self.base_url is None:
            raise ValueError('base_url argument must be specified')

        if not 'metadata_registry' in kwargs:
            kwargs['metadata_registry'] = MetadataRegistry()
            kwargs['metadata_registry'].registerReader('mets',
                                                       dspace_mets_reader)

        if self.sword_enabled:
            skwargs = {'base_url': self.base_url}

            for key in kwargs.keys():
                if key.startswith('sword_'):
                    skwargs[key[6:]] = kwargs.pop(key)

            self.sword = SwordService(**skwargs)

        if self.oai_enabled:
            self.oai = Client('/'.join((
                self.base_url,
                self.oai_path,
            )), **kwargs)

        self.identifier_base = self._extractIdentifierBase(self.base_url)

    def _extractIdentifierBase(self, url):
        """
        From a given URL, extract the OAI identifier base (hostname)
        """
        return urlparse(url).hostname

    def _extractSet(self, handle):
        """
        Determine the OAI set from a collection handle
        """
        if not isinstance(handle, basestring):
            raise ValueError('Collection handles must be strings')
        return 'hdl_' + handle.replace('/', '_').replace(':', '_')

    def getName(self):
        """
        Get the configured name of the repository
        """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        return self.oai.identify().repositoryName()

    def getCollections(self):
        """
        Get a list of the collections in the repository
        """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        return map(lambda c: c[0:2], self.oai.listSets())

    def getItemHandles(self, collection=None, **kw):
        """
        Get item handles from the OAI-PMH interface
        """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        for item in self.getItemIdentifiers(collection=collection, **kw):
            yield item.identifier().split(':', 2)[2]

    def getItemIdentifiers(self, collection=None, **kw):
        """
        Get item identifiers from the OAI-PMH interface
        """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')
        if collection:
            kw['set'] = self._extractSet(collection)
        return self.oai.listIdentifiers(**kw)

    def getItems(self, collection=None, **kw):
        """
        Get full items from the OAI-PMH interface
        """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')
        if collection:
            kw['set'] = self._extractSet(collection)
        return self.oai.listRecords(**kw)

    def getItem(self, handle=None, identifier=None, **kwargs):
        """
        Get a single item from the OAI-PMH interface either by handle or
        identifier
        """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kwargs.setdefault('metadataPrefix', 'mets')
        if handle is None and identifier is None:
            raise ValueError('Either handle or identifier must be provided')
        if handle is not None:
            if identifier is not None:
                raise ValueError('Either a handle or identifier must be '
                                 'provided, not both')
            identifier = 'oai:%s:%s' % (self.identifier_base, handle)
        return self.oai.getRecord(identifier=identifier, **kwargs)

    def getOAIItemIdentifier(self, handle):
        return 'oai:%s:%s' % (self._extractIdentifierBase(self.base_url),
                              handle)

    def getSwordCollections(self):
        pass

    def getSwordCollection(self, args):
        pass
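# Added usage sketch for the Repository class above (not part of the original
# source). The base_url is a hypothetical DSpace instance and the collection
# handle is a placeholder; it assumes the surrounding module (including
# dspace_mets_reader) is importable, since the default registry depends on it.
repo = Repository(base_url='http://demo.dspace.example/xmlui',
                  oai_path='dspace-oai/request')
print(repo.getName())  # repositoryName() from the Identify response

# Iterate over the handles of items in one collection
for handle in repo.getItemHandles(collection='123456789/2'):
    print(handle)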
def identifiy(target):
    if target is not None:
        client = Client(target['url'], registry)
        identify = client.identify()
        return convert_identifiy(identify)
def retrieval(self, repository):
    self.logger.info(u'Trying to retrieve url {0}'.format(repository[1]).encode(ENCODE))
    registry = MetadataRegistry()
    registry.registerReader(METADATA, oai_dc_reader)
    try:
        client = Client(repository[1], registry)
        self.logger.info(SEPARATOR)
        self.logger.info(u'Connection established successfully...')

        # identify info
        identify = client.identify()
        repository_name = identify.repositoryName()
        repository_name_normalized = re.sub(re.compile(FILE_ESCAPE_CHARS), '', repository_name).strip() \
            .replace(' ', '_').lower()
        base_url = identify.baseURL().encode(ENCODE)
        protocol_version = identify.protocolVersion().encode(ENCODE)
        granularity = identify.granularity().encode(ENCODE)
        compression = identify.compression()
        deleted_record = identify.deletedRecord().encode(ENCODE)

        metadata = {'repository_name': repository_name, 'base_url': base_url, 'latest_url': repository[1],
                    'protocol_version': protocol_version, 'granularity': granularity,
                    'compression': str(compression).strip('[]'), 'deleted_record': deleted_record}

        self.logger.info(u'Repository name: {0}'.format(repository_name))
        self.logger.info(u'URL connected: {0}'.format(repository[1]))
        self.logger.info(u'Base URL: {0}'.format(base_url))
        self.logger.info(u'Protocol version: {0}'.format(protocol_version))
        self.logger.info(u'Granularity: {0}'.format(granularity))
        self.logger.info(u'Compression: {0}'.format(compression))
        self.logger.info(u'Deleted record: {0}'.format(deleted_record))

        records_count = 0
        deleted_count = 0
        records_list = list()
        parsed_records_list = list()

        # we're not interested in all sets, so we must iterate over the ones we have and want to crawl
        if repository[2] is not None:
            self.logger.info(u'Fetching set {0}...'.format(repository[2]))
            records_list = client.listRecords(metadataPrefix=METADATA, set=repository[2])
        else:
            records_list = client.listRecords(metadataPrefix=METADATA)

        if records_list is not None:
            for record in records_list:
                records_count += 1
                if record[0].isDeleted():
                    deleted_count += 1
                if record[1] is not None:
                    parsed_records_list.append(tostring(record[1].element()))

        self.logger.info(u'Retrieved {0} records from set {1} where {2} were deleted'.format(
            records_count, repository[2], deleted_count))

        if not exists(''.join(['files/', repository_name_normalized, '/'])):
            self.logger.info('Creating storage folder for {0}...'.format(repository_name))
            makedirs(''.join(['files/', repository_name_normalized, '/']))

        self.logger.info(u'Creating storage files...')
        meta_file = open(''.join(['files/', repository_name_normalized, '/metadata.xml']), 'w')
        metadata[repository[2] + '_records_number'] = records_count
        metadata[repository[2] + '_deleted_number'] = deleted_count
        meta_file.write(tostring(dict_to_xml('metadata', metadata)))
        meta_file.close()

        record_file = open(''.join(
            ['files/', repository_name_normalized, '/', repository_name_normalized, '_',
             repository[2], '.xml']), 'w')
        record_file.write(''.join(parsed_records_list))
        record_file.close()
    except NoRecordsMatchError, nrme:
        self.logger.error(u'{0} on repository {1}'.format(nrme.message, repository_name))
        # add url to unvisited_url and ask retrieval to try to crawl them again
        if nrme.message == 'No matches for the query':
            self.unvisited_repository.append(repository)
class OpenBeeldenDataLoader(DataLoader):

    def __init__(self):
        self.ES_INDEX = 'et_openbeelden'
        self.ES_DOC_TYPE = 'mediaresource'
        self.es_local = Elasticsearch(host=LTV_ES_SETTINGS['host'], port=LTV_ES_SETTINGS['port'])

    def loadMediaResourceData(self, resourceUri, clientIP, loadAnnotations):
        mediaResource = MediaResource(resourceUri)

        #load the annotations (only named entities in this case)
        mediaResource = self.__getAllAnnotationsOfResource(mediaResource)

        #fetch the video metadata
        mediaResource = self.__getAllVideoMetadata(mediaResource, clientIP)

        #transform the mediaresource object to JSON and return it
        resp = simplejson.dumps(mediaResource, default=lambda obj: obj.__dict__)
        return resp

    def loadMediaResources(self, provider):  # ignores provider
        return self.loadOpenBeeldenItemsFromES(0, [])

    def loadOpenBeeldenItemsFromES(self, offset, videos):
        query = {
            "query": {
                "match_all": {}
            },
            "fields": [],
            "from": offset,
            "size": 300
        }
        resp = self.es_local.search(index=self.ES_INDEX, doc_type=self.ES_DOC_TYPE, body=query, timeout="10s")
        if resp and len(resp['hits']['hits']) > 0:
            print len(resp['hits']['hits'])
            vids = []
            for hit in resp['hits']['hits']:
                vid = self.es_local.get(index=self.ES_INDEX, doc_type=self.ES_DOC_TYPE, id=hit['_id'])
                vids.append(vid['_source'])
            for vd in vids:
                video = {
                    'id': vd['id'].replace(':', '_'),
                    'title': '; '.join(vd['title']),
                    'date': '; '.join(vd['date']),
                    'locator': self.__getMediumByExtension(vd['medium'], 'mp4'),
                    'thumbUrl': self.__getMediumByExtension(vd['medium'], 'png'),
                    'thumbBaseUrl': ''
                }
                videos.append(video)
            self.loadOpenBeeldenItemsFromES(offset + 300, videos)
        return {'videos': videos}

    def __getMediumByExtension(self, mediums, extension):
        poster = None
        for m in mediums:
            if m.find('.%s' % extension) != -1:
                poster = m
                break
        return poster

    def __getAllAnnotationsOfResource(self, mediaResource):
        nes = []
        """
        nes.append(NamedEntity(
            label,
            entityType=LinkedTVDataUtils.getNEType(DCType, RDFType, OWLSameAs),
            subTypes=LinkedTVDataUtils.getDCTypes(DCType),
            disambiguationURL=OWLSameAs,
            start=start,
            end=end,
            annotationURI=annotationURI,
            relevance=r,
            confidence=c))
        """
        mediaResource.setNamedEntities(nes)
        return mediaResource

    def __getAllVideoMetadata(self, mediaResource, clientIP):
        print mediaResource.getId()
        vd = self.es_local.get(index=self.ES_INDEX, doc_type=self.ES_DOC_TYPE,
                               id=mediaResource.getId().replace('_', ':'))
        if vd:
            vd = vd['_source']
            mediaResource.setVideoMetadata(vd)
            mediaResource.setPlayoutUrl(self.__getMediumByExtension(vd['medium'], 'mp4'))

            #set the video metadata in the mediaresource
            mediaResource.setTitle('; '.join(vd['title']))
            mediaResource.setDate('; '.join(vd['date']))
            mediaResource.setThumbBaseUrl(None)
            mediaResource.setSrtUrl(None)
            mediaResource.setSubtitles(None)
        return mediaResource

    def setupOAIPMHConnection(self):
        oai_oi_reader = MetadataReader(
            fields={
                'title': ('textList', 'oai_oi:oi/oi:title/text()'),
                'alternative': ('textList', 'oai_oi:oi/oi:alternative/text()'),
                'creator': ('textList', 'oai_oi:oi/oi:creator/text()'),
                'subject': ('textList', 'oai_oi:oi/oi:subject/text()'),
                'description': ('textList', 'oai_oi:oi/oi:description/text()'),
                'abstract': ('textList', 'oai_oi:oi/oi:abstract/text()'),
                'publisher': ('textList', 'oai_oi:oi/oi:publisher/text()'),
                'contributor': ('textList', 'oai_oi:oi/oi:contributor/text()'),
                'date': ('textList', 'oai_oi:oi/oi:date/text()'),
                'type': ('textList', 'oai_oi:oi/oi:type/text()'),
                'extent': ('textList', 'oai_oi:oi/oi:extent/text()'),
                'medium': ('textList', 'oai_oi:oi/oi:medium/text()'),
                'identifier': ('textList', 'oai_oi:oi/oi:identifier/text()'),
                'source': ('textList', 'oai_oi:oi/oi:source/text()'),
                'language': ('textList', 'oai_oi:oi/oi:language/text()'),
                'references': ('textList', 'oai_oi:oi/oi:references/text()'),
                'spatial': ('textList', 'oai_oi:oi/oi:spatial/text()'),
                'attributionName': ('textList', 'oai_oi:oi/oi:attributionName/text()'),
                'attributionURL': ('textList', 'oai_oi:oi/oi:attributionURL/text()'),
                'license': ('textList', 'oai_oi:oi/oi:license/text()')
            },
            namespaces={
                'oai_oi': 'http://www.openbeelden.nl/feeds/oai/',  # 'http://www.openarchives.org/OAI/2.0/oai_oi/',
                'oi': 'http://www.openbeelden.nl/oai/'
            }
        )
        URL = 'http://www.openbeelden.nl/feeds/oai/'

        #Initialize the OAI client
        self.registry = MetadataRegistry()
        self.registry.registerReader('oai_oi', oai_oi_reader)
        self.client = Client(URL, self.registry)

        #Test if the connection to the OAI-PMH provider works
        x = self.client.updateGranularity()
        x = self.client.identify()
        print 'identity %s' % x.repositoryName()
        print 'identity %s' % x.protocolVersion()
        print 'identity %s' % x.baseURL()
        """
        for s in client.listSets():
            print s
        """
        #initialize the OpenSKOSHandler
        self.openSKOSHandler = OpenSKOSHandler()

    def reindex(self, provider=None):
        self.setupOAIPMHConnection()
        i = 0
        extent = None
        item = None
        identifier = None
        for rec in self.client.listRecords(metadataPrefix=u'oai_oi', set=u'beeldengeluid'):  # stichting_natuurbeelden, beeldengeluid
            header, metadata, about = rec
            extent = metadata.getField('extent')[0]
            item = {
                'id': header.identifier(),
                'identifier': self.getFieldData(metadata, 'identifier'),
                'title': self.getFieldData(metadata, 'title'),
                'alternative': self.getFieldData(metadata, 'alternative'),
                'creator': self.getFieldData(metadata, 'creator'),
                'subject': self.getFieldData(metadata, 'subject'),
                'description': self.getFieldData(metadata, 'description'),
                'abstract': self.getFieldData(metadata, 'abstract'),
                'publisher': self.getFieldData(metadata, 'publisher'),
                'contributor': self.getFieldData(metadata, 'contributor'),
                'date': self.getFieldData(metadata, 'date'),
                'date2': header.datestamp(),
                'type': self.getFieldData(metadata, 'type'),
                'extent': extent,
                'medium': self.getFieldData(metadata, 'medium'),
                'source': self.getFieldData(metadata, 'source'),
                'language': self.getFieldData(metadata, 'language'),
                'references': self.getFieldData(metadata, 'references'),
                'spatial': self.getFieldData(metadata, 'spatial'),
                'attributionName': self.getFieldData(metadata, 'attributionName'),
                'attributionURL': self.getFieldData(metadata, 'attributionURL'),
                'license': self.getFieldData(metadata, 'license'),
                'durationSecs': self.getExtentInSeconds(extent)
            }
            self.es_local.index(index=self.ES_INDEX, doc_type=self.ES_DOC_TYPE,
                                id=header.identifier(), body=item)
        print 'Done'
        return True

    def getGTAATermsBySubjects(self, subject, spatial):
        """Get the GTAA terms related to the subject"""
        gtaaTerms = self.getGTAATermsBasedOnSubjectAndLocation(subject, spatial)

        """If there is no identifier, try to fetch the taakID from iMMix ES"""
        if identifier == '' and source != '':
            print 'No taakID!'
            taakID = self.getTaakIDBasedOnSource(source)
            if taakID:
                print 'assigning taakID to the identifier'
                identifier = taakID
        return gtaaTerms

    def getFieldData(self, metadata, fn):
        #return '; '.join(metadata.getField(fn))
        return metadata.getField(fn)

    def getExtentInSeconds(self, ext):
        secs = 0
        if ext and ext.find('PT') != -1:
            ext = ext[2:len(ext)]
            if ext.find('H') != -1:
                secs += int(ext[0:ext.find('H')]) * 3600
                ext = ext[ext.find('H') + 1:len(ext)]
            if ext.find('M') != -1:
                secs += int(ext[0:ext.find('M')]) * 60
                ext = ext[ext.find('M') + 1:len(ext)]
            if ext.find('S') != -1:
                secs += int(ext[0:ext.find('S')])
        return secs

    def secsToTimeString(self, secs):
        h = m = s = 0
        while secs - 3600 >= 0:
            h += 1
            secs -= 3600
        while secs - 60 >= 0:
            m += 1
            secs -= 60
        return '%d:%d:%d' % (h, m, s)

    #Run the main function
    def getGTAATermsBasedOnSubjectAndLocation(self, subject, spatial):
        subs = None
        locs = None
        os_res = None
        gtaaExact = []
        gtaaFuzzy = []

        """First add GTAA terms based on the subject(s)"""
        if subject:
            subs = subject.split(';')
            for s in subs:
                os_res = self.openSKOSHandler.autoCompleteTable(s)
                if os_res:
                    if len(os_res) == 1:
                        gtaaExact.append('%s,%s' % (os_res[0]['label'], os_res[0]['value']))
                    elif len(os_res) > 1:
                        for r in os_res:
                            gtaaFuzzy.append('%s,%s' % (r['label'], r['value']))

        """Append the GTAA terms based on the location(s)"""
        if spatial:
            locs = spatial.split(';')
            for l in locs:
                os_res = self.openSKOSHandler.autoCompleteTable(l, 'http://data.beeldengeluid.nl/gtaa/GeografischeNamen')
                if os_res:
                    if len(os_res) == 1:
                        gtaaExact.append('%s,%s' % (os_res[0]['label'], os_res[0]['value']))
                    elif len(os_res) > 1:
                        for r in os_res:
                            gtaaFuzzy.append('%s,%s' % (r['label'], r['value']))
        return (gtaaExact, gtaaFuzzy)

    def getImmixMetadataBasedOnDrager(self, drager):
        global tot
        query = {"query": {"bool": {"must": [{"query_string": {
            "default_field": "positie.dragernummer",
            "query": "\"%s\"" % drager}}],
            "must_not": [], "should": []}}}
        #print query
        resp = es_local.search(index="search_expressie", doc_type="searchable_expressie",
                               body=query, timeout="10s")
        #print resp
        if resp and resp['hits']['total'] == 1:
            for hit in resp['hits']['hits']:
                return hit
        elif resp and resp['hits']['total'] > 1:
            print 'more than one hit...'
            print resp
        return None

    def getTaakIDBasedOnSource(self, source):
        dragernrs = str(source).split('; ')
        drager = None

        """Get the drager from the source (sometimes there are two, but most of the times they are the same)"""
        if len(dragernrs) == 2:
            if dragernrs[0] != dragernrs[1]:
                print dragernrs
                print '>>>>>>>>>> There are two dragers...'
            else:
                drager = dragernrs[0]
        else:
            drager = dragernrs[0]

        """Try to find the taakID related to the drager"""
        if drager:
            md = self.getImmixMetadataBasedOnDrager(drager)
            if md:
                taakID = md['_source']['expressie']['niveau']['taakID']
                if taakID:
                    print 'Found a taakID: %s\t%s' % (drager, taakID)
                    return taakID
        return None
def transfer_experiment(source):
    """
    Pull public experiments from source into current mytardis.
    """

    #TODO: Cleanup error messages
    #TODO: does not transfer licences as they are not part of the METS format.

    #NOTE: As this is a pull we trust the data from the other tardis

    # Check identity of the feed
    from oaipmh.client import Client
    from oaipmh import error
    from oaipmh.metadata import MetadataRegistry, oai_dc_reader

    from django.core.cache import cache
    from django.utils.hashcompat import md5_constructor as md5

    # The cache key consists of the task name and the MD5 digest
    # of the feed URL.
    cache_key = md5("token").hexdigest()
    lock_id = "%s-lock-%s" % ("consume_experiment", cache_key)
    LOCK_EXPIRE = 60 * 5

    # cache.add fails if the key already exists
    acquire_lock = lambda: cache.add(lock_id, "true", LOCK_EXPIRE)
    # memcache delete is very slow, but we have to use it to take
    # advantage of using add() for atomic locking
    release_lock = lambda: cache.delete(lock_id)

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    source_url = "%s/apps/oaipmh/?verb=Identify" % source

    client = Client(source_url, registry)
    try:
        identify = client.identify()
    except AttributeError as e:
        msg = "Error reading repos identity: %s:%s" % (source, e)
        logger.error(msg)
        raise ReposReadError(msg)
    except error.ErrorBase as e:
        msg = "OAIPMH error: %s" % e
        logger.error(msg)
        raise OAIPMHError(msg)
    except URLError as e:
        logger.error(e)
        raise

    repos = identify.baseURL()
    import urlparse
    repos_url = urlparse.urlparse(repos)
    dest_name = "%s://%s" % (repos_url.scheme, repos_url.netloc)
    if dest_name != source:
        msg = "Source directory reports incorrect name: %s" % dest_name
        logger.error(msg)
        raise BadAccessError(msg)

    # Get list of public experiments at sources
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(source + "/apps/oaipmh/?verb=ListRecords&metadataPrefix=oai_dc",
                    registry)
    try:
        exps_metadata = [meta
                         for (header, meta, extra)
                         in client.listRecords(metadataPrefix='oai_dc')]
    except AttributeError as e:
        msg = "Error reading experiment %s" % e
        logger.error(msg)
        raise OAIPMHError(msg)
    except error.NoRecordsMatchError as e:
        msg = "no public records found on source %s" % e
        logger.warn(msg)
        return

    local_ids = []
    for exp_metadata in exps_metadata:
        exp_id = exp_metadata.getField('identifier')[0]
        user = exp_metadata.getField('creator')[0]

        found_user = _get_or_create_user(source, user)

        #make sure experiment is publicish
        try:
            xmldata = getURL("%s/apps/reposproducer/expstate/%s/"
                             % (source, exp_id))
        except HTTPError as e:
            msg = "cannot get public state of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        try:
            exp_state = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse public state of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        if not exp_state in [Experiment.PUBLIC_ACCESS_FULL,
                             Experiment.PUBLIC_ACCESS_METADATA]:
            msg = 'cannot ingest private experiment %s.' % exp_id
            logger.error(msg)
            raise BadAccessError(msg)

        # Get the usernames of isOwner django_user ACLs for the experiment
        try:
            xmldata = getURL("%s/apps/reposproducer/acls/%s/"
                             % (source, exp_id))
        except HTTPError as e:
            msg = "Cannot get acl list of experiment %s" % exp_id
            logger.error(msg)
            raise ReposReadError(msg)
        try:
            acls = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse acl list of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        owners = []
        for acl in acls:
            if acl['pluginId'] == 'django_user' and acl['isOwner']:
                user = _get_or_create_user(source, acl['entityId'])
                owners.append(user.username)
            else:
                # FIXME: skips all other types of acl for now
                pass

        # Get the METS for the experiment
        metsxml = ""
        try:
            metsxml = getURL("%s/experiment/metsexport/%s/?force_http_urls"
                             % (source, exp_id))
            #metsxml = getURL("%s/experiment/metsexport/%s/"
            #                 % (source, exp_id))
        except HTTPError as e:
            msg = "cannot get METS for experiment %s" % exp_id
            logger.error(msg)
            raise ReposReadError(msg)

        # load schema and parametername for experiment keys
        try:
            key_schema = Schema.objects.get(namespace=settings.KEY_NAMESPACE)
        except Schema.DoesNotExist as e:
            msg = "No ExperimentKeyService Schema found"
            logger.error(msg)
            raise BadAccessError(msg)

        try:
            key_name = ParameterName.objects.get(name=settings.KEY_NAME)
        except ParameterName.DoesNotExist as e:
            msg = "No ExperimentKeyService ParameterName found"
            logger.error(msg)
            raise BadAccessError(msg)

        try:
            xmldata = getURL("%s/apps/reposproducer/key/%s/"
                             % (source, exp_id))
        except HTTPError as e:
            msg = "cannot get key of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        if not xmldata:
            logger.warn("Unable to retrieve experiment %s key. Will try again later"
                        % exp_id)
            return

        try:
            key_value = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse key list of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        if not key_value:
            logger.warn("Unable to retrieve experiment %s key value. Will try again later"
                        % exp_id)
            return

        logger.debug("retrieved key %s from experiment %s" % (key_value, exp_id))
        exps = Experiment.objects.all()

        got_lock = True
        if not acquire_lock():
            logger.warning("another worker has access to consume experiment")
            return

        duplicate_exp = 0
        for exp in exps:
            #logger.warn("exp = %s" % exp.id)
            params = ExperimentParameter.objects.filter(
                name=key_name,
                parameterset__schema=key_schema,
                parameterset__experiment=exp)
            #logger.warn("params.count() = %s" % params.count())
            if params.count() >= 1:
                key = params[0].string_value
                if key == key_value:
                    duplicate_exp = exp.id
                    #logger.warn("found duplicate for %s" % duplicate_exp)
                    break

        if duplicate_exp:
            logger.warn("Found duplicate experiment from %s exp %s to exp %s"
                        % (source, exp_id, duplicate_exp))
            if got_lock:
                release_lock()
            return

        # TODO: Need some way of updating an existing experiment. Problem is
        # that copy will have different id from original, so need unique identifier
        # to allow matching

        # We have now pulled everything we need from the producer and are ready
        # to create the experiment.

        # Make placeholder experiment and ready metadata
        e = Experiment(
            title='Placeholder Title',
            approved=True,
            created_by=found_user,
            public_access=exp_state,
            locked=False  # so experiment can then be altered.
        )
        e.save()

        # store the key
        #eps, was_created = ExperimentParameterSet.objects.\
        #    get_or_create(experiment=e, schema=key_schema)
        #if was_created:
        #    logger.warn("was created")
        #ep, was_created = ExperimentParameter.objects.get_or_create(
        #    parameterset=eps,
        #    name=key_name,
        #    string_value=key_value)
        #if was_created:
        #    logger.warn("was created again")
        #ep.save()

        if got_lock:
            release_lock()

        local_id = e.id
        filename = path.join(e.get_or_create_directory(), 'mets_upload.xml')
        f = open(filename, 'wb+')
        f.write(metsxml)
        f.close()

        # Ingest this experiment META data and isOwner ACLS
        eid = None
        try:
            eid, sync_path = _registerExperimentDocument(filename=filename,
                                                         created_by=found_user,
                                                         expid=local_id,
                                                         owners=owners)
            logger.info('=== processing experiment %s: DONE' % local_id)
        except:
            # FIXME: what errors can mets return?
            msg = '=== processing experiment %s: FAILED!' % local_id
            logger.error(msg)
            raise MetsParseError(msg)

        # FIXME: if METS parse fails then we should go back and delete the placeholder experiment

        exp = Experiment.objects.get(id=eid)

        # so that tardis does not copy the data
        for datafile in exp.get_datafiles():
            datafile.stay_remote = True
            datafile.save()

        #import nose.tools
        #nose.tools.set_trace()

        # FIXME: reverse lookup of URLs seems quite slow.
        # TODO: put this information into specific metadata schema attached to experiment
        exp.description += get_audit_message(source, exp_id)
        exp.save()

        local_ids.append(local_id)

    return local_ids
marcxml_reader = MARCXMLReader()

# Define the metadata readers in the registry
from oaipmh import metadata

registry = metadata.MetadataRegistry()
registry.registerReader('oai_dc', metadata.oai_dc_reader)
registry.registerReader('marc21', marcxml_reader)

#### OAI-PMH Client processing
oai = Client('http://snape.mzk.cz/OAI-script', registry)

id = oai.identify()
print id.repositoryName()
print id.adminEmails()
print id.baseURL()

formats = oai.listMetadataFormats()
pprint(formats)
# 'marc21'

sets = oai.listSets()
for s in sets:
    print s
# 'MZK03'
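# Added follow-up sketch (not in the original): once the 'marc21' reader is
# registered, records from one of the sets listed above can be harvested the
# same way. The set name 'MZK03' is taken from the listSets() output above;
# how each record is parsed depends on the MARCXMLReader implementation.
for header, metadata, about in oai.listRecords(metadataPrefix='marc21',
                                               set='MZK03'):
    print header.identifier(), header.datestamp()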