import logging
# assumes the migrate() helper and the CORE_DATASETS, CORE_FILES,
# CORE_AGGREGATIONS constants are defined in the surrounding module

def _sync_records_by_id(self, query, timestamp_query):
    '''
    Method that executes synchronization of all cores based on the dataset id
    (within a given time interval).
    '''

    # number of records copied from source Solr --> target Solr
    numDatasets = 0
    numFiles = 0
    numAggregations = 0

    # query for dataset ids from source, target Solrs
    print("Querying source")
    source_dataset_ids = self._query_dataset_ids(self.source_solr_base_url, CORE_DATASETS, query, timestamp_query)
    print("Querying target")
    target_dataset_ids = self._query_dataset_ids(self.target_solr_base_url, CORE_DATASETS, query, timestamp_query)

    # synchronize source Solr --> target Solr
    # copy datasets that are missing at the target or whose _timestamp differs
    # commit after every core query
    for source_dataset_id in source_dataset_ids.keys():
        if (source_dataset_id not in target_dataset_ids
                or source_dataset_ids[source_dataset_id] != target_dataset_ids[source_dataset_id]):
            logging.info("\t\t\t\tCopying source dataset=%s" % source_dataset_id)

            numDatasets += migrate(self.source_solr_base_url, self.target_solr_base_url, CORE_DATASETS,
                                   query='id:%s' % source_dataset_id, commit=True, optimize=False)
            numFiles += migrate(self.source_solr_base_url, self.target_solr_base_url, CORE_FILES,
                                query='dataset_id:%s' % source_dataset_id, commit=True, optimize=False)
            numAggregations += migrate(self.source_solr_base_url, self.target_solr_base_url, CORE_AGGREGATIONS,
                                       query='dataset_id:%s' % source_dataset_id, commit=True, optimize=False)

    # synchronize target Solr <-- source Solr
    # must delete datasets that no longer exist at the source
    for target_dataset_id in target_dataset_ids.keys():
        if target_dataset_id not in source_dataset_ids:
            # check whether the dataset still exists at the source:
            # if yes, it will be updated; if not, it must be deleted
            exists = self._check_record(self.source_solr_base_url, CORE_DATASETS, target_dataset_id)
            if not exists:
                logging.info("\t\t\t\tDeleting dataset=%s" % target_dataset_id)
                self._delete_solr_records(self.target_solr_base_url, core=CORE_DATASETS, query='id:%s' % target_dataset_id)
                self._delete_solr_records(self.target_solr_base_url, core=CORE_FILES, query='dataset_id:%s' % target_dataset_id)
                self._delete_solr_records(self.target_solr_base_url, core=CORE_AGGREGATIONS, query='dataset_id:%s' % target_dataset_id)

    return (numDatasets, numFiles, numAggregations)
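
# For context, a minimal sketch of what the _query_dataset_ids helper used above
# might look like (hypothetical: the real implementation is not shown in this
# excerpt). It assumes each dataset document carries 'id' and '_timestamp' fields
# and returns a dictionary of id --> _timestamp, which is what the comparison in
# the copy loop above relies on. It pages through Solr's standard /select handler.
import requests

def _query_dataset_ids(self, solr_base_url, core, query, timestamp_query):
    '''Returns a dictionary of dataset id --> _timestamp for all matching records.'''
    dataset_ids = {}
    start, rows = 0, 1000
    while True:
        params = {'q': query, 'fq': timestamp_query, 'fl': 'id,_timestamp',
                  'wt': 'json', 'start': start, 'rows': rows}
        response = requests.get("%s/%s/select" % (solr_base_url, core), params=params)
        response.raise_for_status()
        docs = response.json()['response']['docs']
        for doc in docs:
            dataset_ids[doc['id']] = doc.get('_timestamp')
        if len(docs) < rows:
            break  # last page retrieved
        start += rows
    return dataset_ids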
def _sync_records_by_time(self, core, query, timestamp_query):
    '''Method that executes synchronization of all records for a given core within a given time interval.'''

    # first delete all records in the timestamp bin from the target Solr
    # will NOT commit the changes yet
    delete_query = "(%s)AND(%s)" % (query, timestamp_query)
    self._delete_solr_records(self.target_solr_base_url, core, delete_query)

    # then migrate records from the source Solr
    # commit but do NOT optimize the index yet
    numRecords = migrate(self.source_solr_base_url, self.target_solr_base_url, core, query=query, fq=timestamp_query,
                         commit=True, optimize=False)
    logging.info("\t\t\tNumber of records migrated=%s" % numRecords)
    return numRecords
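
# Similarly, a minimal sketch of the _delete_solr_records helper (hypothetical:
# the real implementation is not shown). It posts a delete-by-query command to
# the core's /update handler without an explicit commit, consistent with the
# comment above that the deletes are only committed by the subsequent migrate().
import json
import requests

def _delete_solr_records(self, solr_base_url, core, query):
    '''Deletes all records in the given core matching the given query (no commit).'''
    url = "%s/%s/update" % (solr_base_url, core)
    headers = {'Content-Type': 'application/json'}
    response = requests.post(url, data=json.dumps({'delete': {'query': query}}),
                             headers=headers)
    response.raise_for_status()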
 
# Driver script: bulk-migrate 'datasets' records from a source to a target Solr,
# rewriting the index node host name on the fly
sourceSolrUrl = "http://localhost:8983/solr"
targetSolrUrl = "http://localhost:7000/solr"
core = "datasets"
#replace = None
#replace = "pcmdi9.llnl.gov:esgf-node.jpl.nasa.gov"
#replace = "pcmdi9.llnl.gov:others"

# total number of records indexed = maxRecords * numIterations * number of replacements
maxRecords = 10000    # maximum number of records per migration
numIterations = 1000  # number of migrations

replacements = ["pcmdi9.llnl.gov:esgf-node.jpl.nasa.gov",
                "pcmdi9.llnl.gov:pcmdi9.llnl.gov",
                "pcmdi9.llnl.gov:esgf-data.dkrz.de",
                "pcmdi9.llnl.gov:esgf-node.ipsl.fr",
                "pcmdi9.llnl.gov:esgf.nccs.nasa.gov",
                "pcmdi9.llnl.gov:esg2.nci.org.au",
                "pcmdi9.llnl.gov:esgf-index1.ceda.ac.uk",
                "pcmdi9.llnl.gov:esgdata.gfdl.noaa.gov",
                "pcmdi9.llnl.gov:hydra.fsl.noaa.gov",
                "pcmdi9.llnl.gov:others"]

for replace in replacements:

    for i in range(1, 1 + numIterations):

        print("Executing iteration #: %s for replacement=%s" % (i, replace))
        suffix = ".%s" % i
        migrate(sourceSolrUrl, targetSolrUrl, core, maxRecords=maxRecords, suffix=suffix, replace=replace,
                query='index_node:pcmdi9.llnl.gov')
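
# The "old:new" format of the replace strings above suggests that migrate()
# rewrites occurrences of the old index-node name to the new one in each record
# before posting it to the target Solr, while suffix presumably keeps the copied
# record ids unique across iterations. A hypothetical reading of the replace spec:

def _apply_replacement(value, replace):
    '''Rewrites occurrences of 'old' with 'new', given a replace spec "old:new".'''
    old, new = replace.split(':', 1)
    return value.replace(old, new)

# e.g. _apply_replacement('pcmdi9.llnl.gov/dataset1', 'pcmdi9.llnl.gov:esgf-node.jpl.nasa.gov')
# returns 'esgf-node.jpl.nasa.gov/dataset1'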