Пример #1
0
def runBigPerou():
    """Submit the 'perou-T2000-BP' Spark/Avro production job.

    The slave count is read from ``cluster`` and passed as ``--nbSlaves``;
    ``--maxSlaves`` is half of the reported core count, truncated to an int.
    The assembled ``spark-submit`` command line is handed to
    ``cluster.runOnMasterX``.
    """

    coreCount = cluster.getNCores()
    slaveCount = cluster.getNSlaves()
    # NOTE(review): the memory value is never used below; the query is kept
    # so the sequence of calls made on ``cluster`` is unchanged.
    _memory = cluster.getMemory()

    slaveCap = int(coreCount / 2)

    # Data preparation steps, currently disabled and kept for reference:
    #   cluster.runOnMasterX(
    #       'python2.7 irt/processor/scripts/prodLauncher.py write perou 2000 P')
    #   cluster.runOnMasterX('hdfs dfs -put $PYIRT_DIR_WORK/perou-T2000-BP* /')

    nbSlavesOption = '--nbSlaves {0}'.format(slaveCount)
    maxSlavesOption = '--maxSlaves {0}'.format(slaveCap)

    # Note: '--modeStore memory' is not passed for this run
    # (it is commented out in the option list).
    submitParts = [
        'spark-submit',
        '--jars /mnt/data/package/softs/jars/AvroToPythonConverters-1.3.0.jar',
        'irt/processor/scripts/prodLauncher.py',
        'runSparkAvro perou-T2000-BP',
        '--mode cluster',
        '--block 1000',
        '--band P',
        '--res 1',
        '--tile 2000',
        '--nbThreads 2',
        nbSlavesOption,
        maxSlavesOption,
        '--modeProduction L2area',
    ]

    cluster.runOnMasterX(' '.join(submitParts))
Пример #2
0
def runProduction():
    """Submit the 'brisbane1-T4000-BP' Spark/Avro production job.

    Before submitting, the Avro path
    ``/brisbane1-T4000-BP__BP_R150cm_T2000.avro`` is removed with
    ``hdfs dfs -rm -r -f`` through ``cluster.runOnMaster``.  The
    ``spark-submit`` command line is then built (``--maxSlaves`` is half of
    the reported core count, truncated to an int) and handed to
    ``cluster.runOnMasterX``.
    """

    coreCount = cluster.getNCores()
    slaveCount = cluster.getNSlaves()
    slaveCap = int(coreCount / 2)
    # NOTE(review): the memory value is never used below; the query is kept
    # so the sequence of calls made on ``cluster`` is unchanged.
    _memory = cluster.getMemory()

    cleanupCommand = 'hdfs dfs -rm -r -f /brisbane1-T4000-BP__BP_R150cm_T2000.avro'
    cluster.runOnMaster(cleanupCommand)

    nbSlavesOption = '--nbSlaves {0}'.format(slaveCount)
    maxSlavesOption = '--maxSlaves {0}'.format(slaveCap)

    # The spark-avro jar is passed here; the alternative
    # '--jars /mnt/data/package/softs/jars/AvroToPythonConverters-1.3.0.jar'
    # is disabled for this run.
    submitParts = [
        'spark-submit',
        '--jars /mnt/data/package/softs/jars/spark-avro_2.10-2.0.1-SNAPSHOT.jar',
        'irt/processor/scripts/prodLauncher.py',
        'runSparkAvro brisbane1-T4000-BP',
        '--mode cluster',
        '--block 1000',
        '--band P',
        '--res 1.5',
        '--modeStore memory',
        '--tile 2000',
        '--nbThreads 2',
        nbSlavesOption,
        maxSlavesOption,
        '--modeProduction L2area',
    ]

    cluster.runOnMasterX(' '.join(submitParts))
Пример #3
0
def setupFullWithFormat():
    """Run the full setup sequence, including a file system format.

    Nothing is done unless ``cluster.getNSlaves()`` reports more than zero
    slaves.  Otherwise the steps run in this order: software files,
    configuration files, bashrc, file system format, DFS start.
    """

    if not (cluster.getNSlaves() > 0):
        return

    # Order matters: each step is invoked only after the previous returned.
    for step in (setupSoftFiles,
                 setupConfigurationFiles,
                 setupBashrc,
                 formatFileSystem,
                 startDfs):
        step()
Пример #4
0
def setupBashrc():
    """Deploy the Spark shell setup script and hook it into ``setup.sh``.

    The 'spark-setup_spark.sh' template is processed with the
    ``XXSPARKLOCALPATHXX`` placeholder mapped to the single-quoted value of
    ``softdir``, the result is passed to
    ``cluster.rsyncOnAllNodesLocalhostToLocalAsync`` with the target
    ``~/setup_spark.sh``, and a ``source setup_spark.sh`` line is appended to
    ``setup.sh`` via ``cluster.runOnAllNodesAsync``.  When the cloud
    connector is enabled and there is at least one slave, the GCS connector
    jar under ``softdir`` is registered through ``addjars``.
    """

    templatePath = utils.getTemplateFile('spark-setup_spark.sh')
    renderedPath = utils.getLocalTempFile('setup_spark.sh')

    substitutions = {'XXSPARKLOCALPATHXX': "'{0}'".format(softdir)}
    utils.stringReplaceInFile(templatePath, renderedPath, substitutions)

    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        renderedPath, '~/setup_spark.sh')

    # NOTE(review): this appends on every call, so running the setup twice
    # leaves two identical lines in setup.sh.
    cluster.runOnAllNodesAsync('echo "source setup_spark.sh" >> setup.sh')

    if useCloudConnector:
        if cluster.getNSlaves() > 0:
            connectorJar = '{0}/lib/gcs-connector-latest-hadoop2.jar'.format(
                softdir)
            addjars(connectorJar)
Пример #5
0
def setupConfigurationFiles():
    """Generate the Spark configuration files and deploy them.

    Steps, in order:

    * link the current ``python`` to ``pathpython`` on all nodes;
    * write the ``slaves`` file from the slave host list;
    * fill in the ``spark-defaults.conf`` and ``spark-env.sh`` templates;
    * hand the three files to ``cluster.rsyncOnAllNodesLocalhostToLocalAsync``
      with targets under ``softdir + '/conf'``;
    * create ``/tmp/spark-events`` on all nodes;
    * switch the log4j root category from INFO to WARN on the master;
    * when the cloud connector is enabled and there are slaves, install the
      Google Cloud Storage hadoop connector;
    * distribute the ``pyspark-jars`` and ``spark-submit-jars`` wrappers.
    """

    # Only the name listing is used; the second element is ignored.
    listnodes, _ = cluster.instanceListAll()

    # A stand-in python link is created here.  This python serves as the
    # main spark driver, so changing the driver python later only requires
    # overwriting the link.
    linkCommand = 'ln -fs `which python` {0}'.format(pathpython)
    cluster.runOnAllNodesAsync(linkCommand)

    # -------------------------------------------
    # slaves file
    # -------------------------------------------
    # One host name per line: the hosts that have to be used.
    slavesFile = utils.getLocalTempFile('slaves')

    with open(slavesFile, 'w') as handle:
        handle.writelines(host + '\n' for host in listnodes['slaves'])

    # -------------------------------------------
    # spark configuration
    # -------------------------------------------
    # With at least one slave the master URL points at the master host on
    # port 7077; otherwise spark runs locally on (nCores - 1) threads.
    if cluster.getNSlaves() > 0:
        masterUrl = 'spark://{0}:7077'.format(cluster.getMasterName())
    else:
        masterUrl = 'local[{0}]'.format((cluster.nCores - 1))

    defaultsTemplate = utils.getTemplateFile('spark-spark-defaults.conf')
    defaultsFile = utils.getLocalTempFile('spark-defaults.conf')

    coreLimit = int(cluster.nCores - 1)

    defaultsValues = {
        'XX-DRIVER-MEM-XX': mastermemory,
        'XX-EXECUTOR-MEM-XX': executormemory,
        'XX-SPARKMASTER-XX': masterUrl,
        'XX-LOCAL-DIR-XX': localtempdir,
        'XX-CORES-XX': '{0}'.format(coreLimit)
    }
    utils.stringReplaceInFile(defaultsTemplate, defaultsFile, defaultsValues)

    envTemplate = utils.getTemplateFile('spark-spark-env.sh')
    envFile = utils.getLocalTempFile('spark-env.sh')

    # Wraps a value in double quotes for the shell environment file.
    quoted = '"{0}"'.format

    envValues = {
        'XX-PYSPARK_PYTHON-XX': quoted(pathpython),
        'XX-SPARKMASTER-XX': masterUrl,
        'XX-PYSPARK_DRIVER_PYTHON-XX': quoted(pathpython),
        'XX-PYTHONPATH-XX': quoted(pythonpath),
        'XX-LOCAL-DIR-XX': localtempdir,
        'XX-MASTER-IP-XX': quoted(cluster.ipGetMaster())
    }
    utils.stringReplaceInFile(envTemplate, envFile, envValues)

    # Ship the three generated files, in the same order as they were built.
    deployments = (
        (slavesFile, '/conf/slaves'),
        (defaultsFile, '/conf/spark-defaults.conf'),
        (envFile, '/conf/spark-env.sh'),
    )
    for localFile, confSuffix in deployments:
        cluster.rsyncOnAllNodesLocalhostToLocalAsync(
            localFile, softdir + confSuffix)

    cluster.runOnAllNodesAsync('mkdir -p /tmp/spark-events')

    # Silence spark's INFO level output: patch the log4j template, then
    # copy it to the active properties file (master only).
    log4jCommands = (
        'sed -i "s/log4j.rootCategory=INFO/log4j.rootCategory=WARN/g" {0}/conf/log4j.properties.template',
        'cp {0}/conf/log4j.properties.template {0}/conf/log4j.properties',
    )
    for commandTemplate in log4jCommands:
        cluster.runOnMaster(commandTemplate.format(softdir))

    # --------------------------------
    # cloud storage connector
    # --------------------------------
    # The hadoop connector for google cloud storage is installed here; it
    # permits writing data on google cloud storage directly from spark.
    if useCloudConnector and cluster.getNSlaves() > 0:
        fetchConnector = 'gsutil cp {0} .'.format(bucketconnector)
        cluster.runOnAllNodesAsync(fetchConnector)

        # Disabled alternative download:
        #   wget https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop2.jar
        installConnector = 'cp gcs-connector-latest-hadoop2.jar {0}/lib/'.format(
            softdir)
        cluster.runOnAllNodesAsync(installConnector)

        copyCoreSite = 'cp {0}/etc/hadoop/core-site.xml {1}/conf/'.format(
            hadoop.softdir, softdir)
        cluster.runOnAllNodesAsync(copyCoreSite)

    # ------------------------------------------
    # spark overloading scripts
    # ------------------------------------------
    # Spark in standalone mode forces the client mode, and in client mode
    # the spark default conf cannot be used to set additional jars at
    # launch.  Two wrapper scripts (one for pyspark, one for spark-submit)
    # overload the calls instead: they test for the existence of the jar
    # variables and make the call accordingly.
    for wrapperName in ('pyspark-jars', 'spark-submit-jars'):
        wrapperTemplate = utils.getTemplateFile(wrapperName)
        cluster.rsyncOnAllNodesLocalhostToLocalAsync(
            wrapperTemplate, wrapperName)