Exemplo n.º 1
def get_prep_id(dataset):
    """Return the PREP ID that global DBS reports for ``dataset``.

    The global DBS reader is queried through ``listDatasetArray`` with
    ``detail=True`` and the ``prep_id`` field of the first returned record
    is logged at debug level and handed back.

    Raises:
        RuntimeError: if DBS returns an empty response for ``dataset``.
    """
    reader = dbsClient.DbsApi('https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    records = reader.listDatasetArray(dataset=dataset, detail=True)
    if records:
        # Only the first record is consulted.
        found = records[0]['prep_id']
        logging.debug('Got prep ID: {}'.format(found))
        return found
    raise RuntimeError("Invalid dataset: %s" % dataset)
Exemplo n.º 2
def publishInDBS3(taskname):
    # Publish the files of one task into DBS. The list of files is read from
    # /tmp/<taskname>.json. The function returns "EMPTY" when that list is
    # empty, "FAILED" or "NOTHING TO DO" on the early exits below, and 0 at
    # the very end.
    # NOTE(review): this listing is not valid Python as it stands -- several
    # `except` clauses have no visible `try:`, some calls and literals are
    # never closed, and some indented runs have no visible `if`/`else` header.
    # Lines appear to have been lost; the notes below flag the spots. Confirm
    # against the upstream source before relying on any of them.

    def createLogdir(dirname):
        # NOTE(review): the next two lines read like docstring text whose
        # quotes are missing, and the `try:` (with whatever it wrapped) that
        # the `except` below belongs to is not visible.
        Create the directory dirname ignoring erors in case it exists. Exit if
        the directory cannot be created.
        except OSError as ose:
            if ose.errno != 17: #ignore the "Directory already exists error"
                print("The task worker need to access the '%s' directory" % dirname)

    # Per-task logger; basicConfig points the root logger at
    # taskLogs/<taskname>.log using the format from config.General.
    logger = logging.getLogger(taskname)
    logging.basicConfig(filename='taskLogs/'+taskname+'.log', level=logging.INFO, format=config.General.logMsgFormat)

    logger.info("Getting files to publish")

    toPublish = []
    # TODO move from new to done when processed
    with open("/tmp/"+taskname+".json") as f:
        toPublish = json.load(f)

    workflow = taskname

    if len(toPublish) == 0:
        return "EMPTY"

    # workflow is just taskname, so this branch only runs for an empty taskname.
    if not workflow:
        logger.info("NO TASKNAME: %s" % toPublish[0])
    # NOTE(review): dict.iteritems() exists only on Python 2 -- assumes
    # toPublish[0] is a dict; confirm the target interpreter.
    for k, v in toPublish[0].iteritems():
        if k == 'taskname':
            logger.info("Starting: %s: %s" % (k, v))
    wfnamemsg = "%s: " % (workflow)

    user = toPublish[0]["User"]
    # NOTE(review): the deeper indent of the next four lines suggests an
    # enclosing try/except or if/else was lost; the last two assignments
    # overwrite group and role with "".
        group = toPublish[0]["Group"]
        role = toPublish[0]["Role"]
        group = ""
        role = ""

    # Normalise a missing or 'null' group/role to the empty string.
    if not group or group in ['null']:
        group = ""
    if not role or role in ['null']:
        role = ""

    userDN = toPublish[0]["UserDN"]
    pnn = toPublish[0]["Destination"]
    logger.info(wfnamemsg+" "+user)

    READ_PATH = "/DBSReader"
    READ_PATH_1 = "/DBSReader/"

    # TODO: get user role and group
    # NOTE(review): presumably a try/except around the Proxy(...) call was
    # lost; as shown, the "FAILED" return would follow unconditionally.
        proxy = Proxy(userDN, group, role, logger)
        logger.exception("Failed to retrieve user proxy")
        return "FAILED"

    oracelInstance = config.General.oracleDB
    # NOTE(review): the HTTPRequests(...) call below is never closed in this
    # listing; its remaining arguments are not visible.
    oracleDB = HTTPRequests(oracelInstance,

    # Query parameters for the task lookup.
    fileDoc = dict()
    fileDoc['subresource'] = 'search'
    fileDoc['workflow'] = workflow

    # NOTE(review): the `try:` for this lookup, the rest of the get() call and
    # the definition of task_path are not visible here.
        results = oracleDB.get(task_path,
    except Exception as ex:
        logger.error("Failed to get acquired publications from oracleDB for %s: %s" % (workflow, ex))
        return "FAILED"


        # Pick the input dataset, the source DBS URL and the publication DBS
        # URL out of the first result row, locating each by its column name.
        inputDatasetIndex = results[0]['desc']['columns'].index("tm_input_dataset")
        inputDataset = results[0]['result'][inputDatasetIndex]
        sourceURLIndex = results[0]['desc']['columns'].index("tm_dbs_url")
        sourceURL = results[0]['result'][sourceURLIndex]
        publish_dbs_urlIndex = results[0]['desc']['columns'].index("tm_publish_dbs_url")
        publish_dbs_url = results[0]['result'][publish_dbs_urlIndex]

        #sourceURL = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
        # Append the reader path unless the URL already ends with it.
        if not sourceURL.endswith(READ_PATH) and not sourceURL.endswith(READ_PATH_1):
            sourceURL += READ_PATH

    # NOTE(review): this `except` has neither a visible `try:` nor a body.
    except Exception:
    ## When looking up parents may need to look in global DBS as well.
    # The global URL is the source URL with the instance name swapped for 'global'.
    globalURL = sourceURL
    globalURL = globalURL.replace('phys01', 'global')
    globalURL = globalURL.replace('phys02', 'global')
    globalURL = globalURL.replace('phys03', 'global')
    globalURL = globalURL.replace('caf', 'global')

    # Optional proxy taken from the SOCKS5_PROXY environment variable
    # (None when the variable is unset).
    pr = os.environ.get("SOCKS5_PROXY")
    logger.info(wfnamemsg+"Source API URL: %s" % sourceURL)
    sourceApi = dbsClient.DbsApi(url=sourceURL, proxy=pr)
    logger.info(wfnamemsg+"Global API URL: %s" % globalURL)
    globalApi = dbsClient.DbsApi(url=globalURL, proxy=pr)

    WRITE_PATH = "/DBSWriter"
    MIGRATE_PATH = "/DBSMigrate"
    # Same value as the READ_PATH assigned earlier in this function.
    READ_PATH = "/DBSReader"

    # Derive the reader and migration URLs from the publication (writer) URL.
    # NOTE(review): presumably an `else:` separated the two groups of
    # assignments below, and a try/except wrapped the DbsApi construction;
    # neither is visible.
    if publish_dbs_url.endswith(WRITE_PATH):
        publish_read_url = publish_dbs_url[:-len(WRITE_PATH)] + READ_PATH
        publish_migrate_url = publish_dbs_url[:-len(WRITE_PATH)] + MIGRATE_PATH
        publish_migrate_url = publish_dbs_url + MIGRATE_PATH
        publish_read_url = publish_dbs_url + READ_PATH
        publish_dbs_url += WRITE_PATH

        logger.debug(wfnamemsg+"Destination API URL: %s" % publish_dbs_url)
        destApi = dbsClient.DbsApi(url=publish_dbs_url, proxy=pr)
        logger.debug(wfnamemsg+"Destination read API URL: %s" % publish_read_url)
        destReadApi = dbsClient.DbsApi(url=publish_read_url, proxy=pr)
        logger.debug(wfnamemsg+"Migration API URL: %s" % publish_migrate_url)
        migrateApi = dbsClient.DbsApi(url=publish_migrate_url, proxy=pr)
        logger.exception('Wrong DBS URL %s' % publish_dbs_url)
        return "FAILED"

    logger.info("inputDataset: %s" % inputDataset)
    # noInput is True when inputDataset splits into at most three
    # '/'-separated pieces.
    noInput = len(inputDataset.split("/")) <= 3

    # TODO: fix dbs dep
    # NOTE(review): the try/except and if/else headers of this section are not
    # visible; the final three assignments presumably belong to the noInput
    # case -- confirm.
    if not noInput:
            existing_datasets = sourceApi.listDatasets(dataset=inputDataset, detail=True, dataset_access_type='*')
            primary_ds_type = existing_datasets[0]['primary_ds_type']
            # There's little chance this is correct, but it's our best guess for now.
            # CRAB2 uses 'crab2_tag' for all cases
            existing_output = destReadApi.listOutputConfigs(dataset=inputDataset)
            logger.exception('Wrong DBS URL %s' % publish_dbs_url)
            return "FAILED"
        if not existing_output:
            msg = "Unable to list output config for input dataset %s." % (inputDataset)
            global_tag = 'crab3_tag'
            global_tag = existing_output[0]['global_tag']
        msg = "This publication appears to be for private MC."
        primary_ds_type = 'mc'
        global_tag = 'crab3_tag'

    acquisition_era_name = "CRAB"
    processing_era_config = {'processing_version': 1, 'description': 'CRAB3_processing_era'}

    appName = 'cmsRun'
    appVer = toPublish[0]["swversion"]
    # pset_hash is the text after the last '-' in publishname.
    pset_hash = toPublish[0]['publishname'].split("-")[-1]
    # NOTE(review): gtag is not used again in the lines visible here
    # (output_config below uses global_tag).
    gtag = str(toPublish[0]['globaltag'])
    if gtag == "None":
        gtag = global_tag
    # NOTE(review): the try/else/except headers around the acquisition-era
    # selection below are not visible.
        if toPublish[0]['acquisitionera'] and not toPublish[0]['acquisitionera'] in ["null"]:
            acquisitionera = str(toPublish[0]['acquisitionera'])
            acquisitionera = acquisition_era_name
        acquisitionera = acquisition_era_name

    # The unpacking requires outdataset to split into exactly four pieces on
    # '/'; the first piece is discarded.
    _, primName, procName, tier = toPublish[0]['outdataset'].split('/')

    primds_config = {'primary_ds_name': primName, 'primary_ds_type': primary_ds_type}
    # NOTE(review): msg is assigned here and in many places below but never
    # logged in the visible lines; the logging calls may have been lost.
    msg = "About to insert primary dataset: %s" % (str(primds_config))
    msg = "Successfully inserted primary dataset %s." % (primName)

    # Bookkeeping for the summary built at the end of the function.
    final = {}
    failed = []
    publish_in_next_iteration = []
    published = []

    dataset = toPublish[0]['outdataset']
    # Find all (valid) files already published in this dataset.
    # NOTE(review): the `try:` matching the `except` below is not visible.
        existingDBSFiles = destReadApi.listFiles(dataset=dataset, detail=True)
        existingFiles = [f['logical_file_name'] for f in existingDBSFiles]
        existingFilesValid = [f['logical_file_name'] for f in existingDBSFiles if f['is_file_valid']]
        msg = "Dataset %s already contains %d files" % (dataset, len(existingFiles))
        msg += " (%d valid, %d invalid)." % (len(existingFilesValid), len(existingFiles) - len(existingFilesValid))
        final['existingFiles'] = len(existingFiles)
    except Exception as ex:
        msg = "Error when listing files in DBS: %s" % (str(ex))
        msg += "\n%s" % (str(traceback.format_exc()))
        return "FAILED"

    # check if actions are needed
    workToDo = False

    # Any file whose lfn is not among the valid files already in DBS means
    # there is work to do.
    for fileTo in toPublish:
        if fileTo['lfn'] not in existingFilesValid:
            workToDo = True

    if not workToDo:
        msg = "Nothing uploaded, %s has these files already or not enough files." % (dataset)
        return "NOTHING TO DO"

    acquisition_era_config = {'acquisition_era_name': acquisitionera, 'start_date': 0}

    # NOTE(review): this dict literal and the dataset_config literal below are
    # never closed in this listing.
    output_config = {'release_version': appVer,
                     'pset_hash': pset_hash,
                     'app_name': appName,
                     'output_module_label': 'o',
                     'global_tag': global_tag,
    msg = "Published output config."

    dataset_config = {'dataset': dataset,
                      'processed_ds_name': procName,
                      'data_tier_name': tier,
                      'acquisition_era_name': acquisitionera,
                      'dataset_access_type': 'VALID',
                      'physics_group_name': 'CRAB3',
                      'last_modification_date': int(time.time()),
    msg = "About to insert dataset: %s" % (str(dataset_config))
    # The acquisition era key is dropped again right after the message is built.
    del dataset_config['acquisition_era_name']

    # List of all files that must (and can) be published.
    dbsFiles = []
    dbsFiles_f = []
    # Set of all the parent files from all the files requested to be published.
    parentFiles = set()
    # Set of parent files for which the migration to the destination DBS instance
    # should be skipped (because they were not found in DBS).
    parentsToSkip = set()
    # Set of parent files to migrate from the source DBS instance
    # to the destination DBS instance.
    localParentBlocks = set()
    # Set of parent files to migrate from the global DBS instance
    # to the destination DBS instance.
    globalParentBlocks = set()

    # Loop over all files to publish.
    # NOTE(review): inside this loop several `if` statements have no visible
    # body and nothing visible appends to dbsFiles, dbsFiles_f or the parent
    # sets; those statements appear to have been lost.
    for file_ in toPublish:
        # Check if this file was already published and if it is valid.
        if file_['lfn'] not in existingFilesValid:
            # We have a file to publish.
            # Get the parent files and for each parent file do the following:
            # 1) Add it to the list of parent files.
            # 2) Find the block to which it belongs and insert that block name in
            #    (one of) the set of blocks to be migrated to the destination DBS.
            # list(...) iterates over a copy of the parents list.
            for parentFile in list(file_['parents']):
                if parentFile not in parentFiles:
                    # Is this parent file already in the destination DBS instance?
                    # (If yes, then we don't have to migrate this block.)
                    blocksDict = destReadApi.listBlocks(logical_file_name=parentFile)
                    if not blocksDict:
                        # No, this parent file is not in the destination DBS instance.
                        # Maybe it is in the same DBS instance as the input dataset?
                        blocksDict = sourceApi.listBlocks(logical_file_name=parentFile)
                        if blocksDict:
                            # Yes, this parent file is in the same DBS instance as the input dataset.
                            # Add the corresponding block to the set of blocks from the source DBS
                            # instance that have to be migrated to the destination DBS.
                            # No, this parent file is not in the same DBS instance as input dataset.
                            # Maybe it is in global DBS instance?
                            blocksDict = globalApi.listBlocks(logical_file_name=parentFile)
                            if blocksDict:
                                # Yes, this parent file is in global DBS instance.
                                # Add the corresponding block to the set of blocks from global DBS
                                # instance that have to be migrated to the destination DBS.
                    # If this parent file is not in the destination DBS instance, is not
                    # the source DBS instance, and is not in global DBS instance, then it
                    # means it is not known to DBS and therefore we can not migrate it.
                    # Put it in the set of parent files for which migration should be skipped.
                    if not blocksDict:
                # If this parent file should not be migrated because it is not known to DBS,
                # we remove it from the list of parents in the file-to-publish info dictionary
                # (so that when publishing, this "parent" file will not appear as a parent).
                if parentFile in parentsToSkip:
                    msg = "Skipping parent file %s, as it doesn't seem to be known to DBS." % (parentFile)
                    if parentFile in file_['parents']:
            # Add this file to the list of files to be published.
        #print file
    # Print a message with the number of files to publish.
    msg = "Found %d files not already present in DBS which will be published." % (len(dbsFiles))

    # If there are no files to publish, continue with the next dataset.
    # Note this tests dbsFiles_f while the message above counts dbsFiles.
    if len(dbsFiles_f) == 0:
        msg = "Nothing to do for this dataset."
        return "NOTHING TO DO"

    # Migrate parent blocks before publishing.
    # First migrate the parent blocks that are in the same DBS instance
    # as the input dataset.
    if localParentBlocks:
        msg = "List of parent blocks that need to be migrated from %s:\n%s" % (sourceApi.url, localParentBlocks)
        # NOTE(review): this call is never closed in the listing; compare the
        # complete call in the global-blocks branch below.
        statusCode, failureMsg = migrateByBlockDBS3(workflow,
        # A truthy statusCode is treated as a migration failure.
        if statusCode:
            failureMsg += " Not publishing any files."
            failed.extend([f['SourceLFN'] for f in dbsFiles_f])
            failure_reason = failureMsg
            # NOTE(review): published and failed are lists here, so indexing
            # them with the dataset name looks like it would raise TypeError
            # if this line is reached -- confirm.
            published = [x for x in published[dataset] if x not in failed[dataset]]
            return "NOTHING TO DO"
    # Then migrate the parent blocks that are in the global DBS instance.
    if globalParentBlocks:
        msg = "List of parent blocks that need to be migrated from %s:\n%s" % (globalApi.url, globalParentBlocks)
        statusCode, failureMsg = migrateByBlockDBS3(workflow, migrateApi, destReadApi, globalApi, inputDataset, globalParentBlocks)
        if statusCode:
            failureMsg += " Not publishing any files."
            failed.extend([f['SourceLFN'] for f in dbsFiles_f])
            failure_reason = failureMsg
            # NOTE(review): same list-indexed-by-dataset concern as in the
            # local-blocks branch above.
            published = [x for x in published[dataset] if x not in failed[dataset]]
            return "NOTHING TO DO"
    # Publish the files in blocks. The blocks must have exactly max_files_per_block
    # files, unless there are less than max_files_per_block files to publish to
    # begin with. If there are more than max_files_per_block files to publish,
    # publish as many blocks as possible and leave the tail of files for the next
    # PublisherWorker call, unless forced to published.
    block_count = 0
    count = 0
    max_files_per_block = config.General.max_files_per_block
    # NOTE(review): no `break` is visible inside this loop, the `try:` for the
    # `except` below is missing, and no call that inserts blockDump into DBS
    # is visible.
    while True:
        # Each block gets a fresh name: <dataset>#<random UUID>.
        block_name = "%s#%s" % (dataset, str(uuid.uuid4()))
        # Note: this slice reads dbsFiles, the look-ahead further down reads dbsFiles_f.
        files_to_publish = dbsFiles[count:count+max_files_per_block]
            block_config = {'block_name': block_name, 'origin_site_name': pnn, 'open_for_writing': 0}
            msg = "Inserting files %s into block %s." % ([f['logical_file_name']
                                                          for f in files_to_publish], block_name)
            blockDump = createBulkBlock(output_config, processing_era_config,
                                        primds_config, dataset_config,
                                        acquisition_era_config, block_config, files_to_publish)
            #logger.debug(wfnamemsg+"Block to insert: %s\n %s" % (blockDump, destApi.__dict__ ))

            block_count += 1
        except Exception as ex:
            # On error every entry of toPublish (not only this block's files)
            # is recorded as failed.
            logger.error("Error for files: %s" % [f['SourceLFN'] for f in toPublish])
            failed.extend([f['SourceLFN'] for f in toPublish])
            msg = "Error when publishing (%s) " % ", ".join(failed)
            msg += str(ex)
            msg += str(traceback.format_exc())
            failure_reason = str(ex)
        count += max_files_per_block
        # A trailing chunk smaller than a full block is deferred to the next iteration.
        files_to_publish_next = dbsFiles_f[count:count+max_files_per_block]
        if len(files_to_publish_next) < max_files_per_block:
            publish_in_next_iteration.extend([f["SourceLFN"] for f in files_to_publish_next])
    # Drop failed and deferred files from the published list.
    published = [x for x in published if x not in failed + publish_in_next_iteration]
    # Fill number of files/blocks published for this dataset.
    final['files'] = len(dbsFiles) - len(failed) - len(publish_in_next_iteration)
    final['blocks'] = block_count
    # Print a publication status summary for this dataset.
    msg = "End of publication status for dataset %s:" % (dataset)
    msg += " failed (%s) %s" % (len(failed), failed)
    msg += ", published (%s) %s" % (len(published), published)
    # NOTE(review): the statement below is never closed in this listing.
    msg += ", publish_in_next_iteration (%s) %s" % (len(publish_in_next_iteration),
    msg += ", results %s" % (final)

    # NOTE(review): presumably a try/except wrapped the status update below;
    # failure_reason is only assigned on the failure paths above.
        if published:
            mark_good(workflow, published, oracleDB, logger)
        if failed:
            logger.debug("Failed files: %s " % failed)
            mark_failed(failed, oracleDB, logger, failure_reason)
        logger.exception("Status update failed")

    return 0
Exemplo n.º 3
def publishInDBS3(config, taskname, verbose):
    Publish output from one task in DBS

    def mark_good(files, crabServer, logger):
        Mark the list of files as tranferred

        msg = "Marking %s file(s) as published." % len(files)
        if dryRun:
            logger.info("DryRun: skip marking good file")

        nMarked = 0
        for lfn in files:
            data = {}
            source_lfn = lfn
            docId = getHashLfn(source_lfn)
            data['asoworker'] = config.General.asoworker
            data['subresource'] = 'updatePublication'
            data['list_of_ids'] = [docId]
            data['list_of_publication_state'] = ['DONE']
            data['list_of_retry_value'] = [1]
            data['list_of_failure_reason'] = ['']

                result = crabServer.post(api='filetransfers', data=encodeRequest(data))
                logger.debug("updated DocumentId: %s lfn: %s Result %s", docId, source_lfn, result)
            except Exception as ex:
                logger.error("Error updating status for DocumentId: %s lfn: %s", docId, source_lfn)
                logger.error("Error reason: %s", ex)

            nMarked += 1
            if nMarked % 10 == 0:
                logger.info('marked %d files', nMarked)

    def mark_failed(files, crabServer, logger, failure_reason=""):
        Something failed for these files so increment the retry count
        msg = "Marking %s file(s) as failed" % len(files)
        if dryRun:
            logger.debug("DryRun: skip marking failes files")

        nMarked = 0
        for lfn in files:
            source_lfn = lfn
            docId = getHashLfn(source_lfn)
            data = dict()
            data['asoworker'] = config.General.asoworker
            data['subresource'] = 'updatePublication'
            data['list_of_ids'] = [docId]
            data['list_of_publication_state'] = ['FAILED']
            data['list_of_retry_value'] = [1]
            data['list_of_failure_reason'] = [failure_reason]

            logger.debug("data: %s ", data)
                result = crabServer.post(api='filetransfers', data=encodeRequest(data))
                logger.debug("updated DocumentId: %s lfn: %s Result %s", docId, source_lfn, result)
            except Exception as ex:
                logger.error("Error updating status for DocumentId: %s lfn: %s", docId, source_lfn)
                logger.error("Error reason: %s", ex)

            nMarked += 1
            if nMarked % 10 == 0:
                logger.info('marked %d files', nMarked)

    def createLogdir(dirname):
        Create the directory dirname ignoring erors in case it exists. Exit if
        the directory cannot be created.
        except OSError as ose:
            if ose.errno != 17: #ignore the "Directory already exists error"
                print("The task worker need to access the '%s' directory" % dirname)

    def saveSummaryJson(logdir, summary):
        Save a publication summary as JSON. Make a new file every time this script runs
        :param summary: a summary disctionary. Must at least have key 'taskname'
        :param logdir: the directory where to write the summary
        :return: the full path name of the written file
        taskname = summary['taskname']
        counter = 1
        summaryFileName = os.path.join(logdir, taskname + '-1.json')
        while os.path.exists(summaryFileName):
            counter += 1
            summaryFileName = os.path.join(logdir, taskname + '-%d.json' % counter)
        with open(summaryFileName, 'w') as fd:
            json.dump(summary, fd)
        return summaryFileName

    taskFilesDir = config.General.taskFilesDir
    dryRun = config.TaskPublisher.dryRun
    username = taskname.split(':')[1].split('_')[0]
    logdir = os.path.join(config.General.logsDir, 'tasks', username)
    logfile = os.path.join(logdir, taskname + '.log')
    migrationLogDir = os.path.join(config.General.logsDir, 'migrations')
    logger = logging.getLogger(taskname)
    logging.basicConfig(filename=logfile, level=logging.INFO, format=config.TaskPublisher.logMsgFormat)
    if verbose:

    logger.info("Start new iteration on taskname:  %s\nGet files to publish", taskname)

    # prepare a dummy summary JSON file in case there's nothing to do
    nothingToDo = {}
    nothingToDo['taskname'] = taskname
    nothingToDo['result'] = 'OK'
    nothingToDo['reason'] = 'NOTHING TO DO'
    nothingToDo['publishedBlocks'] = 0
    nothingToDo['failedBlocks'] = 0
    nothingToDo['failedBlockDumps'] = []
    nothingToDo['publishedFiles'] = 0
    nothingToDo['failedFiles'] = 0
    nothingToDo['nextIterFiles'] = 0

    toPublish = []
    # TODO move from new to done when processed
    fname = taskFilesDir + taskname + ".json"
    with open(fname) as f:
        toPublish = json.load(f)

    if not toPublish:
        logger.info("Empty data file %s", fname)
        summaryFileName = saveSummaryJson(logdir, nothingToDo)
        return summaryFileName

    pnn = toPublish[0]["Destination"]
    dataset = toPublish[0]['outdataset']
    logger.info("Will publish user files in %s", dataset)

    # CRABServer REST API's (see CRABInterface)
        instance = config.General.instance
        msg = "No instance provided: need to specify config.General.instance in the configuration"
        raise ConfigException(msg)

    if instance in SERVICE_INSTANCES:
        logger.info('Will connect to CRAB service: %s', instance)
        restHost = SERVICE_INSTANCES[instance]['restHost']
        dbInstance = SERVICE_INSTANCES[instance]['dbInstance']
        msg = "Invalid instance value '%s'" % instance
        raise ConfigException(msg)
    if instance == 'other':
        logger.info('Will use restHost and dbInstance from config file')
            restHost = config.General.restHost
            dbInstance = config.General.dbInstance
            msg = "Need to specify config.General.restHost and dbInstance in the configuration"
            raise ConfigException(msg)

    restURInoAPI = '/crabserver/' + dbInstance
    logger.info('Will connect to CRAB Data Base via URL: https://%s/%s', restHost, restURInoAPI)

    crabServer = CRABRest(hostname=restHost, localcert=config.General.serviceCert,
                          localkey=config.General.serviceKey, retry=3,

    data = dict()
    data['subresource'] = 'search'
    data['workflow'] = taskname

        results = crabServer.get(api='task', data=encodeRequest(data))
    except Exception as ex:
        logger.error("Failed to get acquired publications from oracleDB: %s", ex)
        nothingToDo['result'] = 'FAIL'
        nothingToDo['reason'] = 'Error contacting CRAB REST'
        summaryFileName = saveSummaryJson(logdir, nothingToDo)
        return summaryFileName

    if verbose:

        inputDatasetIndex = results[0]['desc']['columns'].index("tm_input_dataset")
        inputDataset = results[0]['result'][inputDatasetIndex]
        sourceURLIndex = results[0]['desc']['columns'].index("tm_dbs_url")
        sourceURL = results[0]['result'][sourceURLIndex]
        publish_dbs_urlIndex = results[0]['desc']['columns'].index("tm_publish_dbs_url")
        publish_dbs_url = results[0]['result'][publish_dbs_urlIndex]

        if not sourceURL.endswith("/DBSReader") and not sourceURL.endswith("/DBSReader/"):
            sourceURL += "/DBSReader"
    except Exception:

    # When looking up parents may need to look in global DBS as well.
    globalURL = sourceURL
    globalURL = globalURL.replace('phys01', 'global')
    globalURL = globalURL.replace('phys02', 'global')
    globalURL = globalURL.replace('phys03', 'global')
    globalURL = globalURL.replace('caf', 'global')

    # allow to use a DBS REST host different from cmsweb.cern.ch (which is the
    # default inserted by CRAB Client)
    sourceURL = sourceURL.replace('cmsweb.cern.ch', config.TaskPublisher.DBShost)
    globalURL = globalURL.replace('cmsweb.cern.ch', config.TaskPublisher.DBShost)
    publish_dbs_url = publish_dbs_url.replace('cmsweb.cern.ch', config.TaskPublisher.DBShost)

    # DBS client relies on X509 env. vars
    os.environ['X509_USER_CERT'] = config.General.serviceCert
    os.environ['X509_USER_KEY'] = config.General.serviceKey

    # create DBS API objects
    logger.info("DBS Source API URL: %s", sourceURL)
    sourceApi = dbsClient.DbsApi(url=sourceURL)
    logger.info("DBS Global API URL: %s", globalURL)
    globalApi = dbsClient.DbsApi(url=globalURL)

    if publish_dbs_url.endswith('/DBSWriter'):
        publish_read_url = publish_dbs_url[:-len('/DBSWriter')] + '/DBSReader'
        publish_migrate_url = publish_dbs_url[:-len('/DBSWriter')] + '/DBSMigrate'
        publish_migrate_url = publish_dbs_url + '/DBSMigrate'
        publish_read_url = publish_dbs_url + '/DBSReader'
        publish_dbs_url += '/DBSWriter'
        logger.info("DBS Destination API URL: %s", publish_dbs_url)
        destApi = dbsClient.DbsApi(url=publish_dbs_url)
        logger.info("DBS Destination read API URL: %s", publish_read_url)
        destReadApi = dbsClient.DbsApi(url=publish_read_url)
        logger.info("DBS Migration API URL: %s", publish_migrate_url)
        migrateApi = dbsClient.DbsApi(url=publish_migrate_url)
    except Exception:
        logger.exception('Wrong DBS URL %s', publish_dbs_url)
        nothingToDo['result'] = 'FAIL'
        nothingToDo['reason'] = 'Error contacting DBS'
        summaryFileName = saveSummaryJson(logdir, nothingToDo)
        return summaryFileName

    logger.info("inputDataset: %s", inputDataset)
    noInput = len(inputDataset.split("/")) <= 3

    if not noInput:
            existing_datasets = sourceApi.listDatasets(dataset=inputDataset, detail=True, dataset_access_type='*')
            primary_ds_type = existing_datasets[0]['primary_ds_type']
            # There's little chance this is correct, but it's our best guess for now.
            # CRAB2 uses 'crab2_tag' for all cases
            existing_output = destReadApi.listOutputConfigs(dataset=inputDataset)
        except Exception:
            logger.exception('Wrong DBS URL %s', publish_dbs_url)
            nothingToDo['result'] = 'FAIL'
            nothingToDo['reason'] = 'Error looking up input dataset in DBS'
            summaryFileName = saveSummaryJson(logdir, nothingToDo)
            return summaryFileName
        if not existing_output:
            msg = "Unable to list output config for input dataset %s." % (inputDataset)
            global_tag = 'crab3_tag'
            global_tag = existing_output[0]['global_tag']
        msg = "This publication appears to be for private MC."
        primary_ds_type = 'mc'
        global_tag = 'crab3_tag'

    acquisition_era_name = "CRAB"
    processing_era_config = {'processing_version': 1, 'description': 'CRAB3_processing_era'}

    appName = 'cmsRun'
    appVer = toPublish[0]["swversion"]
    pset_hash = toPublish[0]['publishname'].split("-")[-1]
    gtag = str(toPublish[0]['globaltag'])
    if gtag == "None":
        gtag = global_tag
        if toPublish[0]['acquisitionera'] and not toPublish[0]['acquisitionera'] in ["null"]:
            acquisitionera = str(toPublish[0]['acquisitionera'])
            acquisitionera = acquisition_era_name
    except Exception:
        acquisitionera = acquisition_era_name

    _, primName, procName, tier = toPublish[0]['outdataset'].split('/')

    primds_config = {'primary_ds_name': primName, 'primary_ds_type': primary_ds_type}
    msg = "About to insert primary dataset"
    if dryRun:
        logger.info("DryRun: skip insertPrimaryDataset")
        msg = "Successfully inserted primary dataset %s." % (primName)

    final = {}
    failed = []
    publish_in_next_iteration = []
    published = []

    # Find all files already published in this dataset.
        existingDBSFiles = destReadApi.listFiles(dataset=dataset, detail=True)
        existingFiles = [f['logical_file_name'] for f in existingDBSFiles]
        existingFilesValid = [f['logical_file_name'] for f in existingDBSFiles if f['is_file_valid']]
        msg = "Dataset %s already contains %d files" % (dataset, len(existingFiles))
        msg += " (%d valid, %d invalid)." % (len(existingFilesValid), len(existingFiles) - len(existingFilesValid))
        final['existingFiles'] = len(existingFiles)
    except Exception as ex:
        msg = "Error when listing files in DBS: %s" % (str(ex))
        msg += "\n%s" % (str(traceback.format_exc()))
        nothingToDo['result'] = 'FAIL'
        nothingToDo['reason'] = 'Error listing existing files in DBS'
        summaryFileName = saveSummaryJson(logdir, nothingToDo)
        return summaryFileName

    # check if actions are needed
    workToDo = False

    for fileTo in toPublish:
        if fileTo['lfn'] not in existingFiles:
            workToDo = True

    if not workToDo:
        msg = "Nothing uploaded, output dataset has these files already."
        logger.info('Make sure those files are marked as Done')
        # docId is the has of the source LFN i.e. the file in the tmp area at the running site
        files = [f['SourceLFN'] for f in toPublish]
        mark_good(files, crabServer, logger)
        summaryFileName = saveSummaryJson(logdir, nothingToDo)
        return summaryFileName

    acquisition_era_config = {'acquisition_era_name': acquisitionera, 'start_date': 0}

    output_config = {'release_version': appVer,
                     'pset_hash': pset_hash,
                     'app_name': appName,
                     'output_module_label': 'o',
                     'global_tag': global_tag,

    dataset_config = {'dataset': dataset,
                      'processed_ds_name': procName,
                      'data_tier_name': tier,
                      'dataset_access_type': 'VALID',
                      'physics_group_name': 'CRAB3',
                      'last_modification_date': int(time.time()),

    logger.info("Output dataset config: %s", str(dataset_config))

    # List of all files that must (and can) be published.
    dbsFiles = []
    dbsFiles_f = []
    # Set of all the parent files from all the files requested to be published.
    parentFiles = set()
    # Set of parent files for which the migration to the destination DBS instance
    # should be skipped (because they were not found in DBS).
    parentsToSkip = set()
    # Set of parent files to migrate from the source DBS instance
    # to the destination DBS instance.
    localParentBlocks = set()
    # Set of parent files to migrate from the global DBS instance
    # to the destination DBS instance.
    globalParentBlocks = set()

    # Loop over all files to publish.
    for file_ in toPublish:
        if verbose:
        # Check if this file was already published and if it is valid.
        if file_['lfn'] not in existingFilesValid:
            # We have a file to publish.
            # Get the parent files and for each parent file do the following:
            # 1) Add it to the list of parent files.
            # 2) Find the block to which it belongs and insert that block name in
            #    (one of) the set of blocks to be migrated to the destination DBS.
            for parentFile in list(file_['parents']):
                if parentFile not in parentFiles:
                    # Is this parent file already in the destination DBS instance?
                    # (If yes, then we don't have to migrate this block.)
                    blocksDict = destReadApi.listBlocks(logical_file_name=parentFile)
                    if not blocksDict:
                        # No, this parent file is not in the destination DBS instance.
                        # Maybe it is in the same DBS instance as the input dataset?
                        blocksDict = sourceApi.listBlocks(logical_file_name=parentFile)
                        if blocksDict:
                            # Yes, this parent file is in the same DBS instance as the input dataset.
                            # Add the corresponding block to the set of blocks from the source DBS
                            # instance that have to be migrated to the destination DBS.
                            # No, this parent file is not in the same DBS instance as input dataset.
                            # Maybe it is in global DBS instance?
                            blocksDict = globalApi.listBlocks(logical_file_name=parentFile)
                            if blocksDict:
                                # Yes, this parent file is in global DBS instance.
                                # Add the corresponding block to the set of blocks from global DBS
                                # instance that have to be migrated to the destination DBS.
                    # If this parent file is not in the destination DBS instance, is not in
                    # the source DBS instance, and is not in the global DBS instance, then it
                    # means it is not known to DBS and therefore we cannot migrate it.
                    # Put it in the set of parent files for which migration should be skipped.
                    if not blocksDict:
                # If this parent file should not be migrated because it is not known to DBS,
                # we remove it from the list of parents in the file-to-publish info dictionary
                # (so that when publishing, this "parent" file will not appear as a parent).
                if parentFile in parentsToSkip:
                    msg = "Skipping parent file %s, as it doesn't seem to be known to DBS." % (parentFile)
                    if parentFile in file_['parents']:
            # Add this file to the list of files to be published.
        #print file

    # Print a message with the number of files to publish.
    msg = "Found %d files not already present in DBS which will be published." % (len(dbsFiles))

    # If there are no files to publish, there is nothing to do for this dataset:
    # save the summary and return.
    if not dbsFiles_f:
        msg = "No file to publish to do for this dataset."
        summaryFileName = saveSummaryJson(logdir, nothingToDo)
        return summaryFileName

    # Migrate parent blocks before publishing.
    # First migrate the parent blocks that are in the same DBS instance
    # as the input dataset.
    if localParentBlocks:
        msg = "List of parent blocks that need to be migrated from %s:\n%s" % (sourceApi.url, localParentBlocks)
        if dryRun:
            logger.info("DryRun: skipping migration request")
            statusCode, failureMsg = migrateByBlockDBS3(taskname, migrateApi, destReadApi, sourceApi,
                                                        inputDataset, localParentBlocks, migrationLogDir, verbose)
            if statusCode:
                failureMsg += " Not publishing any files."
                summaryFileName = saveSummaryJson(logdir, nothingToDo)
                return summaryFileName
    # Then migrate the parent blocks that are in the global DBS instance.
    if globalParentBlocks:
        msg = "List of parent blocks that need to be migrated from %s:\n%s" % (globalApi.url, globalParentBlocks)
        if dryRun:
            logger.info("DryRun: skipping migration request")
            statusCode, failureMsg = migrateByBlockDBS3(taskname, migrateApi, destReadApi, globalApi,
                                                        inputDataset, globalParentBlocks, migrationLogDir, verbose)
            if statusCode:
                failureMsg += " Not publishing any files."
                summaryFileName = saveSummaryJson(logdir, nothingToDo)
                return summaryFileName
    # Publish the files in blocks. The blocks must have exactly max_files_per_block
    # files, unless there are fewer than max_files_per_block files to publish to
    # begin with. If there are more than max_files_per_block files to publish,
    # publish as many blocks as possible and leave the tail of files for the next
    # PublisherWorker call, unless forced to publish.
    block_count = 0
    count = 0
    publishedBlocks = 0
    failedBlocks = 0
    max_files_per_block = config.General.max_files_per_block
    dumpList = []   # keep a list of files where blocks which fail publication are dumped
    while True:
        block_name = "%s#%s" % (dataset, str(uuid.uuid4()))
        files_to_publish = dbsFiles[count:count+max_files_per_block]
            block_config = {'block_name': block_name, 'origin_site_name': pnn, 'open_for_writing': 0}
            if verbose:
                msg = "Inserting files %s into block %s." % ([f['logical_file_name']
                                                              for f in files_to_publish], block_name)
            blockDump = createBulkBlock(output_config, processing_era_config,
                                        primds_config, dataset_config,
                                        acquisition_era_config, block_config, files_to_publish)
            #logger.debug("Block to insert: %s\n %s" % (blockDump, destApi.__dict__ ))

            if dryRun:
                logger.info("DryRun: skip insertBulkBlock")
            block_count += 1
            publishedBlocks += 1
        except Exception as ex:
            #logger.error("Error for files: %s" % [f['SourceLFN'] for f in toPublish])
            logger.error("Error for files: %s", [f['lfn'] for f in toPublish])
            failed.extend([f['SourceLFN'] for f in toPublish])
            #failed.extend([f['lfn'].replace("/store","/store/temp") for f in toPublish])
            msg = "Error when publishing (%s) " % ", ".join(failed)
            msg += str(ex)
            msg += str(traceback.format_exc())
            failure_reason = str(ex)
            taskFilesDir = config.General.taskFilesDir
            fname = os.path.join(taskFilesDir, 'FailedBlocks', 'failed-block-at-%s.txt' % time.time())
            with open(fname, 'w') as fd:
            failedBlocks += 1
            logger.error("FAILING BLOCK DUE TO %s SAVED AS %s", str(ex), fname)
        count += max_files_per_block
        files_to_publish_next = dbsFiles_f[count:count+max_files_per_block]
        if len(files_to_publish_next) < max_files_per_block:
            publish_in_next_iteration.extend([f["SourceLFN"] for f in files_to_publish_next])
            #publish_in_next_iteration.extend([f["lfn"].replace("/store","/store/temp") for f in files_to_publish_next])
    published = [x for x in published if x not in failed + publish_in_next_iteration]
    # Fill number of files/blocks published for this dataset.
    final['files'] = len(dbsFiles) - len(failed) - len(publish_in_next_iteration)
    final['blocks'] = block_count
    # Print a publication status summary for this dataset.
    msg = "End of publication status:"
    msg += " failed %s" % len(failed)
    if verbose:
        msg += ": %s" % failed
    msg += ", published %s" % len(published)
    if verbose:
        msg += ": %s" % published
    msg += ", publish_in_next_iteration %s" % len(publish_in_next_iteration)
    if verbose:
        msg += ": %s" % publish_in_next_iteration
    msg += ", results %s" % (final)

        if published:
            mark_good(published, crabServer, logger)
            data['workflow'] = taskname
            data['subresource'] = 'updatepublicationtime'
            crabServer.post(api='task', data=encodeRequest(data))
        if failed:
            logger.debug("Failed files: %s ", failed)
            mark_failed(failed, crabServer, logger, failure_reason)
    except Exception as ex:
        logger.exception("Status update failed: %s", ex)

    summary = {}
    summary['taskname'] = taskname
    summary['result'] = 'OK' if not failed else 'FAIL'
    summary['reason'] = '' if not failed else 'DBS Publication Failure'
    summary['publishedBlocks'] = publishedBlocks
    summary['failedBlocks'] = failedBlocks
    summary['failedBlockDumps'] = dumpList
    summary['publishedFiles'] = len(published)
    summary['failedFiles'] = len(failed)
    summary['nextIterFiles'] = len(publish_in_next_iteration)

    summaryFileName = saveSummaryJson(logdir, summary)

    return summaryFileName