Пример #1
0
def populateDB():
    """Populate the database with synthetic ``mc12_8TeV`` datasets.

    For every entry of a hard-coded distribution (datatype, production step,
    typical number of files, total size, number of replicas) one dataset is
    created for each of 180 "days" x 30 indices.  The number of files and the
    per-file size are drawn from a Gaussian centred on the entry's typical
    value with a sigma of about 10%.  Every file gets a replica on each
    selected RSE, the files are attached to the dataset, and one replication
    rule per selected RSE is added on the dataset.

    RSEs are picked at random, without repetition, from the result of
    ``list_rses({'T1': '1'})``.

    ``RucioException`` and ``InvalidReplicationRule`` errors are printed and
    the run carries on with the next rule / dataset.
    """
    listrses = list_rses({'T1': '1'})
    print('%s %s' % (len(listrses), listrses))
    account = 'root'
    project = 'mc12_8TeV'
    scope = project

    dictDistrib = [{'datatype': 'HITS', 'prodstep': 'merge', 'nbfiles': 302, 'totfilesize': 225394185112, 'nbreplicas': 1}, {'datatype': 'HITS', 'prodstep': 'simul', 'nbfiles': 620, 'totfilesize': 97930909866, 'nbreplicas': 1},
                   {'datatype': 'EVNT', 'prodstep': 'evgen', 'nbfiles': 324, 'totfilesize': 7809298802, 'nbreplicas': 3}, {'datatype': 'AOD', 'prodstep': 'merge', 'nbfiles': 52, 'totfilesize': 106942334943, 'nbreplicas': 4},
                   {'datatype': 'AOD', 'prodstep': 'recon', 'nbfiles': 858, 'totfilesize': 182186965627, 'nbreplicas': 1}]

    for d in dictDistrib:
        # These values only depend on the distribution entry, so compute them
        # once instead of once per generated dataset.
        prod_step = d['prodstep']
        datatype = d['datatype']
        mean_nbfiles = int(d['nbfiles'])
        mean_filesize = int(int(d['totfilesize']) / float(mean_nbfiles))
        nbreplicas = int(d['nbreplicas'])

        for day in xrange(0, 180):
            for i in xrange(0, 30):
                # Smear the typical values so datasets are not all identical.
                nbfiles = int(random.gauss(mean_nbfiles, mean_nbfiles / 10))
                filesize = int(random.gauss(mean_filesize, mean_filesize / 10))
                if not nbreplicas:
                    continue

                # A fresh dict per dataset: it is handed over to add_identifier.
                dataset_meta = {'project': project, 'stream_name': 'dummy', 'prod_step': prod_step, 'datatype': datatype}

                # Draw distinct RSEs; give up after 100 draws so the loop
                # terminates even when fewer RSEs exist than requested.
                source_rses = []
                attempts = 0
                while len(source_rses) != nbreplicas and attempts != 100:
                    rnd_site = random.choice(listrses)
                    attempts += 1
                    if rnd_site not in source_rses:
                        source_rses.append(rnd_site)

                dsn = '%s.%s.%s.%i.%i' % (project, prod_step, datatype, day, i)
                print('%i Creating %s with %i files of size %i located at %i sites' % (i, dsn, nbfiles, filesize, len(source_rses)))
                try:
                    add_identifier(scope=scope, name=dsn, type='dataset', issuer=account, statuses={'monotonic': True}, meta=dataset_meta)
                    # Use a throwaway variable: in Python 2 a comprehension
                    # variable leaks and would overwrite the loop index ``i``.
                    file_names = ['file_%s' % uuid() for _ in xrange(nbfiles)]
                    listfiles = []
                    for file_name in file_names:
                        listfiles.append({'scope': scope, 'name': file_name, 'size': filesize})
                        for source_rse in source_rses:
                            add_file_replica(source_rse, scope, file_name, filesize, issuer=account)
                    attach_identifier(scope, name=dsn, dids=listfiles, issuer=account)
                    for source_rse in source_rses:
                        try:
                            add_replication_rule(dids=[{'scope': scope, 'name': dsn}], account=account, copies=1, rse_expression=source_rse,
                                                 grouping='DATASET', weight=None, lifetime=None, locked=False, subscription_id=None, issuer='root')
                        except InvalidReplicationRule as e:
                            print(e)
                except RucioException as e:
                    print(e)
Пример #2
0
def populateDB(filename=None):
    """Create 20000 user datasets, each stored on one randomly chosen RSE.

    For every dataset a user scope/account is drawn with
    ``getRandomScope(generatePDF())``, the dataset is registered, 53 files of
    78000000 bytes get a replica on a single random RSE, the files are
    attached to the dataset and a one-copy replication rule is added on that
    RSE.  A counter is recorded through ``monitor`` after each step.

    :param filename: Not used by this function.
    """
    rse_names = [entry['rse'] for entry in list_rses(filters={'deterministic': 1})]

    pdf = generatePDF()

    # Every generated dataset has the same shape.
    nbfiles = 53
    filesize = 78000000

    # Generate 20000 datasets according to the scope distribution
    for index in xrange(0, 20000):
        scope_nb = getRandomScope(pdf)
        # The scope doubles as the project prefix of the dataset name.
        scope = 'user.user%i' % scope_nb
        account = 'user%i' % scope_nb
        print(scope)

        dsn = '%s.%s' % (scope, uuid())
        target_rse = random.choice(rse_names)
        print('%i Creating %s with %i files of size %i located at %s' % (index, dsn, nbfiles, filesize, target_rse))

        add_identifier(scope=scope, name=dsn, type='dataset', issuer=account, statuses={'monotonic': True})
        monitor.record(timeseries='dbfiller.addnewdataset', delta=1)

        # Name all files first, then register one replica per file.
        file_names = ['file_%s' % uuid() for _ in xrange(nbfiles)]
        for file_name in file_names:
            add_file_replica(target_rse, scope, file_name, filesize, issuer=account)
        monitor.record(timeseries='dbfiller.addreplicas', delta=nbfiles)

        dids = [{'scope': scope, 'name': file_name, 'size': filesize} for file_name in file_names]
        attach_identifier(scope, name=dsn, dids=dids, issuer=account)
        monitor.record(timeseries='dbfiller.addnewfile', delta=nbfiles)

        try:
            add_replication_rule(dids=[{'scope': scope, 'name': dsn}], account=account, copies=1, rse_expression=target_rse,
                                 grouping='DATASET', weight=None, lifetime=None, locked=False, subscription_id=None, issuer=account)
            monitor.record(timeseries='dbfiller.addreplicationrules', delta=1)
        except InvalidReplicationRule as err:
            # An invalid rule is reported, not fatal.
            print(err)
def populateDB(filename=None):
    """Create 200000 datasets following a dataset distribution file.

    The distribution file is read line by line; lines starting with
    ``NBDATASETS`` and blank lines are skipped.  Every other line is split on
    whitespace into::

        count project stream_name prod_step datatype provenance group
        nbfiles totfilesize nbreplicas

    ``count`` is the weight of the line: a line is drawn with a probability
    proportional to it.  For each draw a dataset is registered, its files get
    a replica on ``nbreplicas`` distinct random RSEs, the files are attached
    and one replication rule per RSE is added.  Lines whose ``nbreplicas`` is
    0 produce nothing.  Timings are printed and counters are recorded through
    ``monitor`` after each step.

    Groups other than ``/atlas/role=production`` are translated through
    ``dictGroups``, which is expected to be defined outside this function.

    :param filename: Path of the distribution file.  When empty or None,
                     ``$RUCIO_HOME/etc/data12_8TeV_distribution.txt`` is used,
                     or ``/opt/rucio/etc/data12_8TeV_distribution.txt`` if
                     ``RUCIO_HOME`` is not set.
    """
    listrses = list_rses(filters={'deterministic': 1})
    print(listrses)
    listrses = [rse['rse'] for rse in listrses]
    nbDatasets = 0
    # Entries are (lower, upper, fields): the line owns the half-open
    # interval [lower, upper) of the cumulative dataset count.
    distribution = []

    if not filename:
        rucio_home = os.getenv('RUCIO_HOME')
        if rucio_home:
            filename = rucio_home + '/etc/data12_8TeV_distribution.txt'
        else:
            filename = '/opt/rucio/etc/data12_8TeV_distribution.txt'

    # Get the dataset distribution.  ``with`` closes the file even when a
    # line cannot be parsed.
    with open(filename, 'r') as f:
        for line in f:
            if line.startswith('NBDATASETS'):
                continue
            strsplit = line.split()
            if not strsplit:
                # Blank line: nothing to parse.
                continue
            count = int(strsplit[0])
            distribution.append((nbDatasets, nbDatasets + count, strsplit[1:]))
            nbDatasets += count

    # Generate 200000 datasets according to the dataset distribution
    for i in xrange(0, 200000):
        rnd = random.random() * nbDatasets
        # Intervals are half-open so a draw falling exactly on a boundary
        # (including 0.0) still selects exactly one line.
        for lower, upper, fields in distribution:
            if lower <= rnd < upper:
                break
        else:
            continue

        project = fields[0]
        scope = project
        run_number = random.randint(0, 1000000)
        tag = random.randint(0, 10000)
        stream_name = fields[1]
        prod_step = fields[2]
        datatype = fields[3]
        provenance = fields[4]
        group = fields[5]
        if group == '/atlas/role=production':
            if provenance == 'T0':
                group = account = 'tier0'
            else:
                group = account = 'panda'
        else:
            account = 'panda'
            group = dictGroups[group]
            scope = 'group.%s' % group
        nbfiles = int(fields[6])
        filesize = int(int(fields[7]) / float(nbfiles))
        nbreplicas = int(fields[8])

        dataset_meta = {'project': project, 'run_number': run_number, 'stream_name': stream_name, 'prod_step': prod_step,
                        'datatype': datatype, 'provenance': provenance, 'group': group}
        if group not in ('panda', 'tier0'):
            campaign = int(tag / 1000.)
            dataset_meta['campaign'] = '%s_repro_%i' % (group, campaign)

        if not nbreplicas:
            continue

        # Draw distinct RSEs; give up after 100 draws so the loop terminates
        # even when fewer RSEs exist than requested.
        source_rses = []
        attempts = 0
        while len(source_rses) != nbreplicas and attempts != 100:
            rnd_site = random.choice(listrses)
            attempts += 1
            if rnd_site not in source_rses:
                source_rses.append(rnd_site)

        run_number_string = str(run_number).rjust(7, '0')
        dsn = '%s.%s.%s.%s.%s.%s' % (project, run_number_string, stream_name, prod_step, datatype, tag)
        print('%i Creating %s:%s with %i files of size %i located at %i sites' % (i, scope, dsn, nbfiles, filesize, len(source_rses)))
        stime1 = time.time()
        add_identifier(scope=scope, name=dsn, type='dataset', issuer=account, statuses={'monotonic': True}, meta=dataset_meta)
        stime2 = time.time()
        print('Time to generate a dataset : %s' % str(stime2 - stime1))
        monitor.record(timeseries='dbfiller.addnewdataset', delta=1)

        # Use a throwaway variable: in Python 2 a comprehension variable
        # leaks and would overwrite the loop index ``i``.
        file_names = ['file_%s' % uuid() for _ in xrange(nbfiles)]
        listfiles = []
        for file_name in file_names:
            listfiles.append({'scope': scope, 'name': file_name, 'size': filesize})
            for source_rse in source_rses:
                add_file_replica(source_rse, scope, file_name, filesize, issuer=account)
        stime3 = time.time()
        print('Time to create replicas : %s' % str(stime3 - stime2))
        monitor.record(timeseries='dbfiller.addreplicas', delta=nbfiles * len(source_rses))

        attach_identifier(scope, name=dsn, dids=listfiles, issuer=account)
        stime4 = time.time()
        print('Time to attach files : %s' % str(stime4 - stime3))
        monitor.record(timeseries='dbfiller.addnewfile', delta=nbfiles)

        for source_rse in source_rses:
            try:
                add_replication_rule(dids=[{'scope': scope, 'name': dsn}], account=account, copies=1, rse_expression=source_rse,
                                     grouping='DATASET', weight=None, lifetime=None, locked=False, subscription_id=None, issuer='root')
                monitor.record(timeseries='dbfiller.addreplicationrules', delta=1)
            except InvalidReplicationRule as e:
                print(e)
        stime5 = time.time()
        # This interval covers rule creation, not file attachment.
        print('Time to add replication rules : %s' % str(stime5 - stime4))