Example #1
    def setup(self):
        template = []
        cur_dir = os.path.dirname(__file__)
        yaml_path = os.path.join(cur_dir, 'cassandra.yaml')

        basedata = ''
        with open(yaml_path, 'r') as f:
            basedata = f.read()

        def write_yaml(vm):
            pernodedata = CassandraBase % (self.name, vm.intf_ip('eth0'),
                                           vm.intf_ip('eth0'), self.snitch)

            vm_parts = vm.data_directories()

            # The first data directory doubles as the commit-log location;
            # with more than one directory, the rest form the YAML list of
            # data directories.
            commit_dirs = vm_parts[0]
            if len(vm_parts) > 1:
                data_dirs = ("/data\n    - ").join(vm_parts[1:])
            else:
                data_dirs = ("/data\n    - ").join(vm_parts)

            conndata = CassandraTemplate % (','.join(
                self.seed_ips), data_dirs, commit_dirs)

            config = "\n".join([basedata, pernodedata, conndata])
            vm.script(
                'sudo cat <<EOT > {0}/conf/cassandra.yaml\n{1}\nEOT'.format(
                    CASSANDRA_PATH, config))

        parallel(write_yaml, self.nodes)
Example #2
    def restart_hdfs(self):
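        # Restart the NameNode on the master and the DataNodes on the workers.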
        self.master.script('sudo service hadoop-hdfs-namenode restart')
        parallel(lambda vm: vm.script('sudo service hadoop-hdfs-datanode restart'), self.workers)

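        # Recreate the YARN staging area in HDFS with sticky, world-writable permissions.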
        self.master.script('sudo -u hdfs hdfs dfs -mkdir -p /tmp/hadoop-yarn')
        self.master.script('sudo -u hdfs hdfs dfs -chmod -R 1777 /tmp')
        self.master.script('sudo -u hdfs hdfs dfs -chmod -R 1777 /tmp/hadoop-yarn')
Example #3
    def stop(self):
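        # Stop every managed entity in parallel; entities without a stop()
        # method are treated as already stopped.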
        def stop_entity(entity):
            if hasattr(entity, 'stop'):
                entity.stop()
            return True

        parallel(stop_entity, self._entities)
Example #4
def tpch(vms, env):
    tpch_scale = int(env.param('tpch:scale'))
    hive = setup_hive(vms, env)
    hive.master.script(
        tpch_cmd('./tpch-setup.sh {0} >/dev/null 2>&1'.format(tpch_scale)))

    directory = 'tpch-' + hive.master.type + '-' + str(len(vms)) + "-results"
    makedirectory(directory)

    def execute_query(query):
        tpch_run_query(hive.master, query, tpch_scale)

    # Runs are 1-indexed; include the final iteration.
    for iteration in range(1, int(env.param('tpch:runs')) + 1):
        # Drop the file caches so the read/write volumes are measured accurately
        parallel(
            lambda vm: vm.script("sync; echo 3 > /proc/sys/vm/drop_caches"),
            vms)
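        # Bracket the timed parallel query run with the Argos monitor.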
        argos_start(vms, directory, iteration)
        start = time.time()
        parallel(execute_query, TPCH_QUERIES)
        end = time.time()
        argos_finish(vms, directory, iteration)

        with open(
                os.path.join(directory, str(iteration),
                             hive.master.type + '.time'), 'w+') as f:
            f.write('0,%s' % str(end - start))
Example #5
    def setup(self):
        if not self.cloudera.install('Hadoop'):
            return False

        if not self.cloudera.install('Spark'):
            return False

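        # Hive packages go on every node; the MySQL metastore only on the master.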
        def install_hive(vm):
            for package_name in ClouderaHive.HivePackages:
                vm.package_manager.install(package_name)
        parallel(install_hive, self.nodes)

        def setup_mysql():
            vm = self.master
            sql_file = '/usr/lib/hive/setup-mysql-cloudbench.sql'
            vm.install('mysql')
            vm.script(write_template('hive-mysql', sql_file))
            vm.script('cat {0} | mysql -u root'.format(sql_file))

        def setup_hive(vm):
            vm.script(write_template('hive-site', '/usr/lib/hive/conf/hive-site.xml',
                master=self.master.name))

        # Install mysql on the master
        setup_mysql()
        parallel(setup_hive, self.nodes)

        return True
Example #6
    def run_on_testers(self, func):
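        # parallel() invokes func on every test VM concurrently, so results
        # are collected through a thread-safe queue.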
        result = Queue.Queue()
        parallel(lambda vm: result.put(func(vm)), self._test_vms)

        out = []
        while not result.empty():
            out.append(result.get())
        return out
Example #7
def monitor_start(vms):
    # Start IO monitor
    # parallel(lambda vm: vm.monitor(), vms)

    # Start Argos
    parallel(lambda vm: vm.script('rm -rf ~/argos/proc'), vms)
    parallel(lambda vm: vm.script('cd argos; sudo nohup src/argos >argos.out 2>&1 &'), vms)
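    # Give the Argos daemon a moment to come up.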
    time.sleep(2)
Example #8
    def format_hdfs(self):
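        # Remove the HDFS data and tmp directories on every node, then
        # reformat the NameNode.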
        remove_hdfs_dir = self.hadoop_user_cmd('"rm -rf {0}/hdfs"'.format(
            self.hdfs_path(self.master)))
        parallel(lambda vm: vm.script(remove_hdfs_dir), self.all_nodes())

        remove_hdfs_dir = self.hadoop_user_cmd('"rm -rf {0}/tmp"'.format(
            self.hdfs_path(self.master)))
        parallel(lambda vm: vm.script(remove_hdfs_dir), self.all_nodes())

        self.master.script(
            self.hadoop_user_cmd('"hdfs namenode -format -force"'))
Example #9
def tpch(vms, env):
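    # Load the TPC-H data at TPCH_SCALE, then time all queries running concurrently.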
    hive = setup_hive(env, vms)
    hive.master.script(tpch_cmd('./tpch-setup.sh {0}'.format(TPCH_SCALE)))

    def execute_query(num):
        tpch_run_query(hive.master, num, TPCH_SCALE)

    start = time.time()
    parallel(execute_query, TPCH_QUERIES)
    end = time.time()
    print "Total time: %.2f" % (end - start)
Example #10
def setup_spark(vms, env):
    setup_disks(vms, env)
    setup_base(vms, env)

    ce = Cloudera(vms)
    ce.install('Hadoop')
    ce.install('Spark')

    # Make sure Spark's work directory and its HDFS user directory are writable
    parallel(lambda vm: vm.script('chown -R ubuntu:ubuntu /var/lib/spark/work'), vms)
    parallel(lambda vm: vm.script('sudo -u hdfs hdfs dfs -chmod 777 /user/spark'), vms)

    return ce['Spark']
Example #11
def setup_disks(env, vms):
    def setup_vm_disks(vm):
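        # Mount every non-root disk at /data/<id>, formatting it; ids start at 2.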
        root = vm.root_disk()
        disks = vm.disks()
        disk_id = 2

        for disk in disks:
            if root.startswith(disk):
                continue
            vm.mount(disk, '/data/%d' % disk_id, force_format=True)
            disk_id += 1

    parallel(setup_vm_disks, vms)
Example #12
def terasort_with_argos_run(vms, env):
    parallel(lambda vm: vm.install('hadoop'), vms)
    parallel(lambda vm: vm.install('ntp'), vms)
    parallel(lambda vm: vm.install('argos'), vms)
    parallel(lambda vm: vm.install('jq'), vms)

    cluster = HadoopCluster(vms[0], vms[1:],
                            env.param('terasort:use_local_disk') != 'False')
    cluster.setup()
    cluster.reset()

    cluster.execute(
        '"/usr/bin/time -f \'%e\' -o terasort.out hadoop jar '
        '/usr/local/hadoop/share/hadoop/mapreduce/'
        'hadoop-mapreduce-examples-2.7.1.jar teragen '
        '-Dmapred.map.tasks={1} {2} {0}"'.format(
            TERASORT_INPUT, env.param('terasort:mappers'),
            env.param('terasort:rows')))
    teragen_time = cluster.master.script('sudo su - hduser -c "tail -n1 terasort.out"').strip()

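    # Start Argos only for the terasort phase; teragen above ran unmonitored.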
    argos_start(vms)

    cluster.execute(
        '"/usr/bin/time -f \'%e\' -o terasort.out hadoop jar '
        '/usr/local/hadoop/share/hadoop/mapreduce/'
        'hadoop-mapreduce-examples-2.7.1.jar terasort '
        '-Dmapred.reduce.tasks={2} {0} {1} >output.log 2>&1"'.format(
            TERASORT_INPUT, TERASORT_OUTPUT, env.param('terasort:reducers')))

    argos_finish(vms)

    collect_terasort_stats(vms)

    terasort_time = cluster.master.script('sudo su - hduser -c "tail -n1 terasort.out"').strip()
    terasort_out = cluster.master.script('sudo su - hduser -c "cat output.log"').strip()

    file_name = str(time.time()) + '-' + cluster.master.type
    with open(file_name + ".time", 'w+') as f:
        f.write(str(teragen_time) + "," + str(terasort_time))

    with open(file_name + ".out", 'w+') as f:
        f.write(terasort_out)
Example #13
    def create_security_group(self, ep):
        """ Create endpoints (the Microsoft equivalent of security groups) """
        ret = True

        def create_endpoint(vm):
            cmd = ['azure', 'vm', 'endpoint', 'create']
            cmd += [self.unique(vm), ep.public_port, ep.private_port]
            # Endpoint names are limited to 15 characters
            cmd += ['--name', self.unique(ep.name)[-15:]]
            cmd += ['--protocol', ep.protocol]
            self.execute(cmd)

        parallel(create_endpoint, ep.virtual_machines())
        return ret
Example #14
    def __init__(self, master_, slaves_, local_disk_=True):
        self.master = master_
        self.slaves_ = slaves_
        self.local_disk_ = local_disk_

        super(HadoopCluster, self).__init__(self.all_nodes(), HADOOP_USER)

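        # Mount a dedicated /dev/xvdb volume for HDFS unless the HDFS path
        # lives under a home directory.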
        def setup_hdfs_permissions(vm):
            path = self.hdfs_path(vm)
            if 'home' not in path:
                vm.mount('/dev/xvdb', path, 'ext4', True)
                vm.script('chown -R %s:%s %s' %
                          (HADOOP_USER, HADOOP_GROUP, path))
                vm.script("chmod -R 755 %s" % path)

        parallel(setup_hdfs_permissions, self.all_nodes())
Example #15
def tpch(vms, env):
    hive = setup_hive(vms, env)
    parallel(lambda vm: vm.install('tpch'), vms)

    hive.master.script(tpch_cmd('./tpch-setup.sh {0}'.format(TPCH_SCALE)))

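    # Time the sequential query run under the Argos monitor.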
    argos_start(vms)
    start = time.time()
    for query in TPCH_QUERIES:
        tpch_run_query(hive.master, query, TPCH_SCALE)
    end = time.time()
    argos_finish(vms)

    file_name = str(time.time()) + '-' + hive.master.type
    with open(file_name + '.time', 'w+') as f:
        f.write(str(end - start))
Example #16
def setup_disks(vms, env):
    def setup_vm_disks(vm):
        vm.script('rm -rf /data/1/')
        root = vm.root_disk()
        disks = vm.disks()
        disk_id = 2

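        # Fall back to the local (ephemeral) disks when none are reported,
        # or on i2.8xlarge instances.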
        if len(disks) == 0 or vm.type == 'i2.8xlarge':
            disks = vm.local_disks_except_root()

        for disk in disks:
            if root.startswith(disk):
                continue
            vm.mount(disk, '/data/%d' % disk_id, force_format=True)
            disk_id += 1
    parallel(setup_vm_disks, vms)
Example #17
    def setup_directories(self):
        def create_yarn_dfs_folders(vm):
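            # Rebuild fresh YARN and DFS directory trees under every data directory.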
            if len(vm.data_directories()) == 0:
                vm.script("mkdir -p /data/1/")

            for dd in vm.data_directories():
                vm.script("rm -r {base}/yarn".format(base=dd))
                vm.script("rm -r {base}/dfs".format(base=dd))

                vm.script("mkdir -p {base}/yarn/logs".format(base=dd))
                vm.script("mkdir -p {base}/yarn/local".format(base=dd))
                vm.script("chown -R yarn:yarn {base}/yarn".format(base=dd))

                vm.script("mkdir -p {base}/dfs/nn".format(base=dd))
                vm.script("mkdir -p {base}/dfs/dn".format(base=dd))
                vm.script("chown -R hdfs:hdfs {base}/dfs".format(base=dd))
        parallel(create_yarn_dfs_folders, self.nodes)
Example #18
    def traverse_dag(self, check, execute, direction='dependencies'):
        """ Traverse the DAG from the leaves up to the root, making sure
        all of a node's dependencies have executed the "execute" function
        before the node itself is executed """
        def satisfied(ent):
            """ Return True if all of the entity's requirements are
            satisfied """
            for dep in getattr(ent, direction):
                deps = getattr(ent, dep)()
                if not deps:
                    continue

                if not isinstance(deps, list):
                    deps = [deps]

                # if any of the dependencies are not satisfied, return False
                if any(map(lambda x: not check(x), deps)):
                    return False

            return True

        # Collect all entities
        everything = set()
        for ent in self.entities().values():
            everything = everything.union(set(ent.values()))

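        # Each pass: run execute() on every unexecuted entity whose
        # dependencies are satisfied, then drop the completed entities.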
        while everything:
            to_remove = set()
            to_execute = set()
            lock = RLock()

            def satisfy(x):
                if satisfied(x):
                    if not check(x):
                        with lock:
                            to_execute.add(x)
                    else:
                        with lock:
                            to_remove.add(x)

            parallel(satisfy, everything)
            parallel(execute, to_execute)
            to_remove = to_remove.union(
                set(filter(lambda x: check(x), to_execute)))
            everything = everything - to_remove
Example #19
def run_ycsb(vms, env, cluster, workload, record_count, operation_count):
    lock = RLock()
    insert_start = [0]
    insert_count = [record_count / len(vms)]
    op_count = [operation_count / len(vms)]

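    # The lock hands each VM a disjoint, contiguous slice of the key space.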
    def run_workload(vm):
        start = 0
        with lock:
            start = insert_start[0]
            insert_start[0] += insert_count[0]

        cmd = "./bin/ycsb run cassandra2-cql -P workloads/workload{3} -p hosts='{0}' -p recordcount={1} -p operationcount={2} -p insertstart={4} -p insertcount={5} -s -threads 1000 >~/run.log 2>&1"
        cmd = cmd.format(','.join(cluster.node_ip_list()), record_count,
                         op_count[0], workload, start, insert_count[0])
        vm.script("cd {0} && {1}".format(YCSB_PATH, cmd))

    parallel(run_workload, vms)
Example #20
def setup_hive(vms, env):
    parallel(lambda vm: vm.install('hadoop'), vms)
    parallel(lambda vm: vm.install('hive'), vms)
    parallel(lambda vm: vm.install('mahout'), vms)
    parallel(lambda vm: vm.install('bigbench'), vms)
    parallel(lambda vm: vm.install('argos'), vms)

    hadoop = HadoopCluster(vms[0], vms[1:],
                           env.param('terasort:use_local_disk') != 'False')
    hadoop.setup()
    hadoop.reset()

    hive = HiveCluster(hadoop)
    hive.setup()

    return hive
Example #21
def terasort(vms, env):
    hadoop = setup_hadoop(env, vms)
    print "Master is: %s" % hadoop.master.name

    directory = 'terasort-' + hadoop.master.type + '-' + str(
        len(vms)) + "-results"
    makedirectory(directory)
    iteration = str(1)

    extra_teragen_params = "-Ddfs.blocksize=512M -Dmapreduce.task.io.sort.mb=256"

    hadoop.master.execute("sudo service hadoop-hdfs-namenode restart")
    hadoop.master.execute("sudo service hadoop-hdfs-datanode restart")
    hadoop.master.execute("sudo service hadoop-yarn-resourcemanager restart")

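    # Roughly four waves of map tasks across 80% of the cluster's cores.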
    mapper_count = int(4 * int(sum(map(lambda vm: vm.cpus(), vms))) * 0.8)
    hadoop.execute(
        'sudo -u hdfs hadoop jar /usr/lib/hadoop-0.20-mapreduce/hadoop-examples-2.6.0-mr1-cdh5*.jar teragen {2} -D mapred.map.tasks={0} {1} /terasort-input'
        .format(mapper_count, env.param('terasort:rows'),
                extra_teragen_params))

    # Drop file caches to be more accurate for amount of reads and writes
    parallel(lambda vm: vm.script("sync; echo 3 > /proc/sys/vm/drop_caches"),
             vms)

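    # One reducer per core, scaled to 80% of the cluster's total cores.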
    reducer_count = int(sum(map(lambda vm: vm.cpus(), vms)) * 0.8)

    extra_terasort_params = "-Ddfs.blocksize=512M -Dmapreduce.task.io.sort.factor=100 -Dmapreduce.task.io.sort.mb=384 -Dio.file.buffer.size=131072"
    monitor_start(vms)
    hadoop.execute(
        '/usr/bin/time -f \'%e\' -o terasort.out sudo -u hdfs hadoop jar /usr/lib/hadoop-0.20-mapreduce/hadoop-examples-2.6.0-mr1-cdh5*.jar terasort {1} -D mapred.reduce.tasks={0} /terasort-input /terasort-output >output.log 2>&1'
        .format(str(reducer_count), extra_terasort_params))
    monitor_finish(vms, directory, iteration)

    terasort_time = hadoop.master.script('tail -n1 terasort.out').strip()
    terasort_out = hadoop.master.script('cat output.log').strip()
    file_name = hadoop.master.type
    with open(os.path.join(directory, str(iteration), file_name + ".time"),
              'w+') as f:
        f.write("0," + str(terasort_time))

    with open(os.path.join(directory, str(iteration), file_name + ".out"),
              'w+') as f:
        f.write(terasort_out)
Example #22
def setup_spark_perf(env, vms):
    path = Config.path('tools', 'spark-perf.tar.gz')
    parallel(lambda vm: vm.send(path, '/home/ubuntu'), vms)
    parallel(lambda vm: vm.script('rm -rf /home/ubuntu/spark-perf'), vms)
    parallel(lambda vm: vm.script('tar -xzf spark-perf.tar.gz'), vms)
    num_cores = len(vms) * vms[0].cpus()

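    # Pin spark-perf's num-partitions to the cluster's core count and
    # disable automatic scaling.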
    def replace_line(vm):
        vm.script(
            "cd spark-perf; sed -i '/OptionSet(\"num-partitions\", \[128\], can_scale=True),/c\    OptionSet(\"num-partitions\", [%d], can_scale=False),' config/config.py"
            % num_cores)

    parallel(replace_line, vms)
Example #23
    def setup_core_site(self):
        config = """
           <property>
             <name>hadoop.tmp.dir</name>
             <value>file://{0}/tmp</value>
             <description>Temporary Directory.</description>
           </property>

           <property>
             <name>fs.defaultFS</name>
             <value>hdfs://{1}:54310</value>
             <description>Use HDFS as file storage engine</description>
           </property>
        """

        config = CoreSiteTemplate.format(
            config.format(self.hdfs_path(self.master), self.master.name))
        command = modify_hadoop_config(config, '/etc/hadoop/core-site.xml')

        # Apply the updated core-site.xml on every node in parallel
        parallel(lambda node: node.script(command), self.all_nodes())
Example #24
    def start_entities(self, entities):
        """ Start the entities for a job """
        dead_entities = set()
        lock = RLock()

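        # Only preemptable entities need an explicit start; anything still
        # stale after 180 s is recorded as dead.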
        def entity_up(entity):
            if not isinstance(entity, Preemptable):
                return True

            entity.start()
            entity.wait(180)

            if entity.stale:
                with lock:
                    dead_entities.add(entity)

        parallel(entity_up, entities)

        if len(dead_entities) > 0:
            self.add_dead_entities(dead_entities)
            return False

        return True
Example #25
    def setup(self):
        hadoop = self.cloudera.install('Hadoop')
        if not hadoop:
            return False

        def install_spark(vm):
            for package_name in ClouderaSpark.SparkPackages:
                vm.package_manager.install(package_name)
        parallel(install_spark, self.nodes)

        hadoop.execute('sudo -u hdfs hdfs dfs -mkdir -p /user/spark')
        hadoop.execute('sudo -u hdfs hdfs dfs -mkdir -p /user/spark/share/lib')
        hadoop.execute('sudo -u hdfs hdfs dfs -mkdir -p /user/spark/applicationHistory')
        hadoop.execute('sudo -u hdfs hdfs dfs -chown -R spark:spark /user/spark')
        hadoop.execute('sudo -u hdfs hdfs dfs -chmod 1777 /user/spark/applicationHistory')

        per_node_cpu = self.master.cpus()
        cluster_cpu = per_node_cpu * len(self.nodes)
        # Convert memory to MB (it is later passed with an 'm' suffix) and
        # reserve 1 GB of headroom.
        total_memory = int(self.master.memory() / (1024 * 1024)) - 1024

        # One single-core executor per cluster CPU, each receiving half of
        # its proportional share of a node's memory.
        executor_count = cluster_cpu
        executor_cores = 1
        executor_memory = int(
            math.ceil(total_memory * executor_cores / per_node_cpu) * 0.5)

        self.master.script(write_template('spark-defaults.conf',
            '/etc/spark/conf/spark-defaults.conf',
            master=self.master.name,
            instances=executor_count,
            cores=executor_cores,
            memory=(str(executor_memory) + 'm')))

        self.master.script('sudo service spark-history-server restart')
        return True
Example #26
    def setup_hdfs_site(self):
        dirs = ["{0}/hdfs/datanode", "{0}/hdfs/namenode"]

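        # Create the datanode/namenode directories as the Hadoop user on every node.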
        def create_hdfs_dirs(vm):
            for d in map(lambda x: x.format(self.hdfs_path(vm)), dirs):
                vm.script('sudo su - {0} -c "mkdir -p {1}"'.format(
                    HADOOP_USER, d))

        parallel(create_hdfs_dirs, self.all_nodes())

        config = """
<property>
 <name>dfs.replication</name>
 <value>1</value>
 <description>Default block replication.
  The actual number of replications can be specified when the file is created.
  The default is used if replication is not specified in create time.
 </description>
</property>
<property>
 <name>dfs.namenode.name.dir</name>
 <value>{0}/hdfs/namenode</value>
 <description>Determines where on the local filesystem the DFS name node should store the name table(fsimage). If this is a comma-delimited list of directories then the name table is replicated in all of the directories, for redundancy.
 </description>
</property>
<property>
 <name>dfs.datanode.data.dir</name>
 <value>{0}/hdfs/datanode</value>
 <description>Determines where on the local filesystem an DFS data node should store its blocks. If this is a comma-delimited list of directories, then data will be stored in all named directories, typically on different devices. Directories that do not exist are ignored.
 </description>
</property>
"""
        config = HdfsSiteTemplate.format(
            config.format(self.hdfs_path(self.master)))
        command = modify_hadoop_config(config, '/etc/hadoop/hdfs-site.xml')
        parallel(lambda vm: vm.script(command), self.all_nodes())