Exemplo n.º 1
0
    def _copy_files(self, deployment_dir, oozie_xml, oozie_properties):
        """
        Create the job files in the deployment directory and stage the jars.

        Two files are created through ``self._create_file``: one named
        ``self.job.XML_FILE_NAME`` (given ``oozie_xml``) and 'job.properties'
        (given the properties rendered as ``key=value`` lines).

        The 'jar_path' property of every node of the job (when the job has
        nodes) is then collected. Relative paths are resolved against
        ``self.job.deployment_dir``; jars already located under
        ``<deployment_dir>/lib`` are skipped. Depending on
        USE_LIBPATH_FOR_JARS the remaining jars are either put in front of
        the existing 'oozie.libpath' value of ``self.properties`` or copied
        into ``<deployment_dir>/lib``.

        This should run as the workflow user.

        :param deployment_dir: directory receiving the XML, the properties
            file and, in copy mode, the lib/ jars.
        :param oozie_xml: data handed to ``_create_file`` for the XML file.
        :param oozie_properties: mapping rendered into 'job.properties'.
        """

        self._create_file(deployment_dir, self.job.XML_FILE_NAME, oozie_xml)
        # .items() instead of .iteritems(): works on Python 2 and Python 3.
        properties_data = '\n'.join(
            ['%s=%s' % (key, val) for key, val in oozie_properties.items()])
        self._create_file(deployment_dir, 'job.properties',
                          data=properties_data)

        # List jar files, first occurrence wins so the order is deterministic
        # and the same jar is never staged twice.
        files = []
        seen = set()
        lib_path = self.fs.join(deployment_dir, 'lib')
        # Compare against "<lib>/" so that siblings such as "library.jar" or
        # "lib2/x.jar" are not mistaken for files that already live in lib.
        lib_prefix = lib_path.rstrip('/') + '/'
        nodes = self.job.nodes if hasattr(self.job, 'nodes') else []
        for node in nodes:
            jar_path = node.data['properties'].get('jar_path')
            if not jar_path:
                continue
            if not jar_path.startswith('/'):  # If workspace relative path
                jar_path = self.fs.join(self.job.deployment_dir, jar_path)
            if jar_path.startswith(lib_prefix):  # Already in lib
                continue
            if jar_path not in seen:
                seen.add(jar_path)
                files.append(jar_path)

        if not files:
            return

        if USE_LIBPATH_FOR_JARS.get():
            # Add the jar files to the oozie.libpath, ahead of any value
            # that is already configured.
            LOG.debug("Adding to oozie.libpath %s", files)
            current_libpath = self.properties.get('oozie.libpath')
            if current_libpath:
                files.append(current_libpath)
            self.properties['oozie.libpath'] = ','.join(files)
        else:
            # Copy the jar files to the workspace lib
            for jar_file in files:
                LOG.debug("Updating %s", jar_file)
                jar_lib_path = self.fs.join(lib_path,
                                            self.fs.basename(jar_file))
                # Refresh if needed: drop a stale copy before copying again
                if self.fs.exists(jar_lib_path) and self.fs.exists(jar_file):
                    stat_src = self.fs.stats(jar_file)
                    stat_dest = self.fs.stats(jar_lib_path)
                    if stat_src.fileId != stat_dest.fileId:
                        self.fs.remove(jar_lib_path, skip_trash=True)
                self.fs.copyfile(jar_file, jar_lib_path)
Exemplo n.º 2
0
  def _copy_files(self, deployment_dir, oozie_xml, oozie_properties):
    """
    Create the job files in the deployment directory and stage the jars.

    Two files are created through ``self._create_file``: one named
    ``self.job.XML_FILE_NAME`` (given ``oozie_xml``) and 'job.properties'
    (given the properties rendered as ``key=value`` lines).

    The 'jar_path' property of every node of the job (when the job has nodes)
    is then collected. Relative paths are resolved against
    ``self.job.deployment_dir``; jars already located under
    ``<deployment_dir>/lib`` are skipped. Depending on USE_LIBPATH_FOR_JARS
    the remaining jars are either put in front of the existing
    'oozie.libpath' value of ``self.properties`` or copied into
    ``<deployment_dir>/lib``.

    This should run as the workflow user.

    :param deployment_dir: directory receiving the XML, the properties file
        and, in copy mode, the lib/ jars.
    :param oozie_xml: data handed to ``_create_file`` for the XML file.
    :param oozie_properties: mapping rendered into 'job.properties'.
    """

    self._create_file(deployment_dir, self.job.XML_FILE_NAME, oozie_xml)
    # .items() instead of .iteritems(): works on Python 2 and Python 3.
    properties_data = '\n'.join(['%s=%s' % (key, val) for key, val in oozie_properties.items()])
    self._create_file(deployment_dir, 'job.properties', data=properties_data)

    # List jar files, first occurrence wins so the order is deterministic
    # and the same jar is never staged twice.
    files = []
    seen = set()
    lib_path = self.fs.join(deployment_dir, 'lib')
    # Compare against "<lib>/" so that siblings such as "library.jar" or
    # "lib2/x.jar" are not mistaken for files that already live in lib.
    lib_prefix = lib_path.rstrip('/') + '/'
    nodes = self.job.nodes if hasattr(self.job, 'nodes') else []
    for node in nodes:
      jar_path = node.data['properties'].get('jar_path')
      if not jar_path:
        continue
      if not jar_path.startswith('/'): # If workspace relative path
        jar_path = self.fs.join(self.job.deployment_dir, jar_path)
      if jar_path.startswith(lib_prefix): # Already in lib
        continue
      if jar_path not in seen:
        seen.add(jar_path)
        files.append(jar_path)

    if not files:
      return

    if USE_LIBPATH_FOR_JARS.get():
      # Add the jar files to the oozie.libpath, ahead of any value that is
      # already configured.
      LOG.debug("Adding to oozie.libpath %s", files)
      current_libpath = self.properties.get('oozie.libpath')
      if current_libpath:
        files.append(current_libpath)
      self.properties['oozie.libpath'] = ','.join(files)
    else:
      # Copy the jar files to the workspace lib
      for jar_file in files:
        LOG.debug("Updating %s", jar_file)
        jar_lib_path = self.fs.join(lib_path, self.fs.basename(jar_file))
        # Refresh if needed: drop a stale copy before copying again
        if self.fs.exists(jar_lib_path) and self.fs.exists(jar_file):
          stat_src = self.fs.stats(jar_file)
          stat_dest = self.fs.stats(jar_lib_path)
          if stat_src.fileId != stat_dest.fileId:
            self.fs.remove(jar_lib_path, skip_trash=True)
        self.fs.copyfile(jar_file, jar_lib_path)
Exemplo n.º 3
0
def test_copy_files():
    """
    Check Submission._copy_files() and Submission._create_file() on the
    shared pseudo HDFS cluster.

    Six jars are created in different places (outside the workspace, in the
    workspace, in the workspace lib/, and as workspace relative paths) and
    the job is deployed to the workspace and to an external directory.
    Depending on USE_LIBPATH_FOR_JARS the test then checks either the
    'oozie.libpath' property or the content of the two lib/ directories,
    including that a second deployment replaces the copied jars.
    """
    cluster = pseudo_hdfs4.shared_cluster()
    # Defined before the try block so the cleanup in `finally` can always
    # reference it, even when the setup below fails early.
    prefix = '/tmp/test_copy_files'

    try:
        # Return value is not needed; presumably this creates the user that
        # is looked up on the next line -- TODO confirm.
        make_logged_in_client()
        user = User.objects.get(username='******')
        ensure_home_directory(cluster.fs, user)

        if cluster.fs.exists(prefix):
            cluster.fs.rmtree(prefix)

        # Jars in various locations
        deployment_dir = '%s/workspace' % prefix
        external_deployment_dir = '%s/deployment' % prefix
        jar_1 = '%s/udf1.jar' % prefix
        jar_2 = '%s/lib/udf2.jar' % prefix
        jar_3 = '%s/udf3.jar' % deployment_dir
        jar_4 = '%s/lib/udf4.jar' % deployment_dir  # Doesn't move
        jar_5 = 'udf5.jar'
        jar_6 = 'lib/udf6.jar'  # Doesn't move

        # Absolute location of every source jar (relative ones live in the
        # workspace).
        sources = [
            jar_1,
            jar_2,
            jar_3,
            jar_4,
            deployment_dir + '/' + jar_5,
            deployment_dir + '/' + jar_6,
        ]

        cluster.fs.mkdir(prefix)
        for source in sources:
            cluster.fs.create(source)

        class MockJob(object):
            XML_FILE_NAME = 'workflow.xml'

            def __init__(self):
                self.deployment_dir = deployment_dir
                self.nodes = [
                    Node({'id': '1', 'type': 'mapreduce', 'properties': {'jar_path': jar_1}}),
                    Node({'id': '2', 'type': 'mapreduce', 'properties': {'jar_path': jar_2}}),
                    Node({'id': '3', 'type': 'java', 'properties': {'jar_path': jar_3}}),
                    Node({'id': '4', 'type': 'java', 'properties': {'jar_path': jar_4}}),

                    # Workspace relative paths
                    Node({'id': '5', 'type': 'java', 'properties': {'jar_path': jar_5}}),
                    Node({'id': '6', 'type': 'java', 'properties': {'jar_path': jar_6}}),
                ]

        submission = Submission(user,
                                job=MockJob(),
                                fs=cluster.fs,
                                jt=cluster.jt)

        submission._copy_files(deployment_dir, "<xml>My XML</xml>",
                               {'prop1': 'val1'})
        submission._copy_files(external_deployment_dir, "<xml>My XML</xml>",
                               {'prop1': 'val1'})

        assert_true(cluster.fs.exists(deployment_dir + '/workflow.xml'),
                    deployment_dir)
        assert_true(cluster.fs.exists(deployment_dir + '/job.properties'),
                    deployment_dir)

        # All sources still there
        for source in sources:
            assert_true(cluster.fs.exists(source), source)

        # Lib directories. Kept in their own names so the workspace path
        # itself stays available for the checks below.
        workspace_lib = deployment_dir + '/lib'
        external_lib = external_deployment_dir + '/lib'

        if USE_LIBPATH_FOR_JARS.get():
            # Relative jar paths are resolved against the job's deployment
            # directory (the workspace), so the expected entries are the
            # source locations, not paths below workspace/lib.
            libpath = submission.properties['oozie.libpath']
            for source in sources:
                assert_true(source in libpath, libpath)
        else:
            jar_names = ['udf%d.jar' % index for index in range(1, 7)]
            list_dir_workspace = cluster.fs.listdir(workspace_lib)
            list_dir_deployement = cluster.fs.listdir(external_lib)

            # All destinations there
            for jar_name in jar_names:
                assert_true(cluster.fs.exists(workspace_lib + '/' + jar_name),
                            list_dir_workspace)
            for jar_name in jar_names:
                assert_true(cluster.fs.exists(external_lib + '/' + jar_name),
                            list_dir_deployement)

            file_ids_before = dict(
                (jar_name,
                 cluster.fs.stats(workspace_lib + '/' + jar_name)['fileId'])
                for jar_name in jar_names)

            submission._copy_files(deployment_dir, "<xml>My XML</xml>",
                                   {'prop1': 'val1'})

            # udf4 and udf6 already live in the workspace lib and must keep
            # their fileId; every other jar must have been replaced.
            untouched = ('udf4.jar', 'udf6.jar')
            for jar_name in jar_names:
                file_id = cluster.fs.stats(
                    workspace_lib + '/' + jar_name)['fileId']
                if jar_name in untouched:
                    assert_equal(file_ids_before[jar_name], file_id)
                else:
                    assert_not_equal(file_ids_before[jar_name], file_id)

        # Test _create_file()
        submission._create_file(workspace_lib, 'test.txt', data='Test data')
        # The listing is computed here (not reused from the branch above) so
        # it is defined in both modes and reflects the current content.
        assert_true(cluster.fs.exists(workspace_lib + '/test.txt'),
                    cluster.fs.listdir(workspace_lib))

    finally:
        try:
            cluster.fs.rmtree(prefix)
        except Exception:
            # Best-effort cleanup: log and keep the test outcome.
            LOG.exception('failed to remove %s' % prefix)
Exemplo n.º 4
0
def test_copy_files():
  """
  Check Submission._copy_files() and Submission._create_file() on the shared
  pseudo HDFS cluster.

  Six jars are created in different places (outside the workspace, in the
  workspace, in the workspace lib/, and as workspace relative paths) and the
  job is deployed to the workspace and to an external directory. Depending on
  USE_LIBPATH_FOR_JARS the test then checks either the 'oozie.libpath'
  property or the content of the two lib/ directories, including that a
  second deployment replaces the copied jars.
  """
  cluster = pseudo_hdfs4.shared_cluster()
  # Defined before the try block so the cleanup in `finally` can always
  # reference it, even when the setup below fails early.
  prefix = '/tmp/test_copy_files'

  try:
    # Return value is not needed; presumably this creates the user that is
    # looked up on the next line -- TODO confirm.
    make_logged_in_client()
    user = User.objects.get(username='******')

    if cluster.fs.exists(prefix):
      cluster.fs.rmtree(prefix)

    # Jars in various locations
    deployment_dir = '%s/workspace' % prefix
    external_deployment_dir = '%s/deployment' % prefix
    jar_1 = '%s/udf1.jar' % prefix
    jar_2 = '%s/lib/udf2.jar' % prefix
    jar_3 = '%s/udf3.jar' % deployment_dir
    jar_4 = '%s/lib/udf4.jar' % deployment_dir # Doesn't move
    jar_5 = 'udf5.jar'
    jar_6 = 'lib/udf6.jar' # Doesn't move

    # Absolute location of every source jar (relative ones live in the
    # workspace).
    sources = [
        jar_1,
        jar_2,
        jar_3,
        jar_4,
        deployment_dir + '/' + jar_5,
        deployment_dir + '/' + jar_6,
    ]

    cluster.fs.mkdir(prefix)
    for source in sources:
      cluster.fs.create(source)

    class MockJob(object):
      XML_FILE_NAME = 'workflow.xml'

      def __init__(self):
        self.deployment_dir = deployment_dir
        self.nodes = [
            Node({'id': '1', 'type': 'mapreduce', 'properties': {'jar_path': jar_1}}),
            Node({'id': '2', 'type': 'mapreduce', 'properties': {'jar_path': jar_2}}),
            Node({'id': '3', 'type': 'java', 'properties': {'jar_path': jar_3}}),
            Node({'id': '4', 'type': 'java', 'properties': {'jar_path': jar_4}}),

            # Workspace relative paths
            Node({'id': '5', 'type': 'java', 'properties': {'jar_path': jar_5}}),
            Node({'id': '6', 'type': 'java', 'properties': {'jar_path': jar_6}})
        ]

    submission = Submission(user, job=MockJob(), fs=cluster.fs, jt=cluster.jt)

    submission._copy_files(deployment_dir, "<xml>My XML</xml>", {'prop1': 'val1'})
    submission._copy_files(external_deployment_dir, "<xml>My XML</xml>", {'prop1': 'val1'})

    assert_true(cluster.fs.exists(deployment_dir + '/workflow.xml'), deployment_dir)
    assert_true(cluster.fs.exists(deployment_dir + '/job.properties'), deployment_dir)

    # All sources still there
    for source in sources:
      assert_true(cluster.fs.exists(source), source)

    # Lib directories. Kept in their own names so the workspace path itself
    # stays available for the checks below.
    workspace_lib = deployment_dir + '/lib'
    external_lib = external_deployment_dir + '/lib'

    if USE_LIBPATH_FOR_JARS.get():
      # Relative jar paths are resolved against the job's deployment
      # directory (the workspace), so the expected entries are the source
      # locations, not paths below workspace/lib.
      libpath = submission.properties['oozie.libpath']
      for source in sources:
        assert_true(source in libpath, libpath)
    else:
      jar_names = ['udf%d.jar' % index for index in range(1, 7)]
      list_dir_workspace = cluster.fs.listdir(workspace_lib)
      list_dir_deployement = cluster.fs.listdir(external_lib)

      # All destinations there
      for jar_name in jar_names:
        assert_true(cluster.fs.exists(workspace_lib + '/' + jar_name), list_dir_workspace)
      for jar_name in jar_names:
        assert_true(cluster.fs.exists(external_lib + '/' + jar_name), list_dir_deployement)

      file_ids_before = dict(
          (jar_name, cluster.fs.stats(workspace_lib + '/' + jar_name)['fileId'])
          for jar_name in jar_names)

      submission._copy_files(deployment_dir, "<xml>My XML</xml>", {'prop1': 'val1'})

      # udf4 and udf6 already live in the workspace lib and must keep their
      # fileId; every other jar must have been replaced.
      untouched = ('udf4.jar', 'udf6.jar')
      for jar_name in jar_names:
        file_id = cluster.fs.stats(workspace_lib + '/' + jar_name)['fileId']
        if jar_name in untouched:
          assert_equal(file_ids_before[jar_name], file_id)
        else:
          assert_not_equal(file_ids_before[jar_name], file_id)

    # Test _create_file()
    submission._create_file(workspace_lib, 'test.txt', data='Test data')
    # The listing is computed here (not reused from the branch above) so it
    # is defined in both modes and reflects the current content.
    assert_true(cluster.fs.exists(workspace_lib + '/test.txt'), cluster.fs.listdir(workspace_lib))

  finally:
    try:
      cluster.fs.rmtree(prefix)
    except Exception:
      # Best-effort cleanup: log and keep the test outcome.
      LOG.exception('failed to remove %s' % prefix)