Пример #1
0
  def readCarbon(key, secret, end_point, path, label):
    """Collect the rows whose ``name`` column equals ``label``.

    A CarbonReader is built over ``path`` with the S3A access key, secret key
    and endpoint supplied as Hadoop configuration entries. Only the ``name``
    column is projected and the batch setting is 780.

    :param key: value stored under ``fs.s3a.access.key``
    :param secret: value stored under ``fs.s3a.secret.key``
    :param end_point: value stored under ``fs.s3a.endpoint``
    :param path: folder handed to ``withFolder``
    :param label: value the ``name`` column is compared with by ``filterEqual``
    :return: a Python list holding every row the reader returned
    """
    from jnius import autoclass
    java_list_class = autoclass('java.util.ArrayList')
    projection_list = java_list_class()
    projection_list.add("name")

    reader = CarbonReader() \
      .builder() \
      .withBatch(780) \
      .withFolder(path) \
      .withHadoopConf("fs.s3a.access.key", key) \
      .withHadoopConf("fs.s3a.secret.key", secret) \
      .withHadoopConf("fs.s3a.endpoint", end_point) \
      .projection(projection_list) \
      .filterEqual("name", label) \
      .build()

    data_list = []
    try:
      while reader.hasNext():
        data_list.extend(reader.readNextBatchRow())
    finally:
      # Release the reader even when fetching a batch raises; otherwise it
      # would stay open for the rest of the process.
      reader.close()
    return data_list
Пример #2
0
def test_run_read_carbon_by_file_lists():
  """Read two explicitly listed carbondata files and expect 20 rows overall.

  The file names are passed through a ``java.util.ArrayList`` to
  ``withFileLists`` and five columns are projected.
  """
  from jnius import autoclass
  java_list_class = autoclass('java.util.ArrayList')

  java_list = java_list_class()
  java_list.add(LOCAL_DATA_PATH + "/sub1/part-0-1196034485149392_batchno0-0-null-1196033673787967.carbondata")
  java_list.add(LOCAL_DATA_PATH + "/sub2/part-0-1196034758543568_batchno0-0-null-1196034721553227.carbondata")

  projection_list = java_list_class()
  for column_name in ("name", "age", "image1", "image2", "image3"):
    projection_list.add(column_name)

  reader = CarbonReader() \
    .builder() \
    .withFileLists(java_list) \
    .withBatch(1000) \
    .projection(projection_list) \
    .build()

  num = 0
  try:
    while reader.hasNext():
      num += len(reader.readNextBatchRow())
  finally:
    # Close before asserting so a failed expectation cannot leak the reader.
    reader.close()

  assert 20 == num
Пример #3
0
def test_run_write_carbon_binary_base64_encode():
    """Write ten rows carrying a base64-encoded JPEG and read them back.

    Each row holds a string, two numeric values passed as strings and the
    base64-encoded image bytes. While reading, the large binary column of the
    first row is base64-decoded and written to ``image.jpg`` inside the output
    folder. The test expects exactly ten rows.
    """
    # Imported once up front instead of on every loop iteration.
    from jnius import autoclass
    from jnius.jnius import ByteArray

    jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary}]"
    path = "/tmp/data/writeCarbon" + str(time.time())

    if os.path.exists(path):
        shutil.rmtree(path)

    jpg_path = IMAGE_DATA_PATH + "/carbondatalogo.jpg"

    # The image is only read, so plain 'rb' is enough and also works on
    # read-only fixtures. Reading it before the writer exists means a missing
    # image cannot leave an open writer behind.
    with open(jpg_path, mode='rb') as file_object:
        content = file_object.read()
    encoded_content = base64.b64encode(content)

    arrayListClass = autoclass("java.util.ArrayList")

    try:
        writer = CarbonWriter() \
          .builder() \
          .outputPath(path) \
          .withCsvInput(jsonSchema) \
          .writtenBy("pycarbon") \
          .build()

        try:
            for i in range(0, 10):
                data_list = arrayListClass()
                data_list.add("pycarbon")
                data_list.add(str(i))
                data_list.add(str(i * 10))
                data_list.add(encoded_content)
                writer.write(data_list.toArray())
        finally:
            writer.close()

        reader = CarbonReader() \
          .builder() \
          .withFolder(path) \
          .withBatch(1000) \
          .build()

        row_count = 0
        try:
            while reader.hasNext():
                for row in reader.readNextBatchRow():
                    row_count += 1
                    if row_count != 1:
                        # Only the first row's image is dumped to disk.
                        continue
                    for column in row:
                        if isinstance(column, ByteArray) and len(column) > 1000:
                            with open(path + "/image.jpg", 'wb') as file_object:
                                file_object.write(base64.b64decode(column.tostring()))
        finally:
            reader.close()

        assert 10 == row_count
    finally:
        # Remove the temporary output even when a step above failed.
        if os.path.exists(path):
            shutil.rmtree(path)
0
def test_run_read_carbon_from_local():
  """Read every row under ``LOCAL_DATA_PATH`` and expect 30 rows."""
  reader = CarbonReader() \
    .builder() \
    .withBatch(780) \
    .withFolder(LOCAL_DATA_PATH) \
    .build()

  num = 0
  try:
    while reader.hasNext():
      num += len(reader.readNextBatchRow())
  finally:
    # Close before asserting so a failed expectation cannot leak the reader.
    reader.close()

  assert 30 == num
Пример #5
0
def test_run_read_carbon_by_file():
  """Read a single carbondata file via ``withFile`` and expect 10 rows."""
  reader = CarbonReader() \
    .builder() \
    .withFile(LOCAL_DATA_PATH + "/sub1/part-0-1196034485149392_batchno0-0-null-1196033673787967.carbondata") \
    .withBatch(1000) \
    .build()

  num = 0
  try:
    while reader.hasNext():
      num += len(reader.readNextBatchRow())
  finally:
    # Close before asserting so a failed expectation cannot leak the reader.
    reader.close()

  assert 10 == num
Пример #6
0
def test_run_read_carbon_from_local_for_filter():
  """Read ``LOCAL_DATA_PATH`` filtered on ``name == "robot0"``; expect 3 rows."""
  reader = CarbonReader() \
    .builder() \
    .withBatch(10) \
    .withFolder(LOCAL_DATA_PATH) \
    .filterEqual("name", "robot0") \
    .build()

  num = 0
  try:
    while reader.hasNext():
      num += len(reader.readNextBatchRow())
  finally:
    # Close before asserting so a failed expectation cannot leak the reader.
    reader.close()

  assert 3 == num
Пример #7
0
def test_run_read_carbon_from_obs():
  """Read every row under ``S3_DATA_PATH`` through S3A and expect 30 rows.

  Access key, secret key and endpoint come from the ``--access_key``,
  ``--secret_key`` and ``--end_point`` pytest command-line options.
  """
  # NOTE(review): the ``pytest.config`` global was removed in pytest 5.0; on a
  # newer pytest these options would have to come from the ``request``
  # fixture. Confirm the pytest version this suite is pinned to.
  access_key = pytest.config.getoption("--access_key")
  secret_key = pytest.config.getoption("--secret_key")
  end_point = pytest.config.getoption("--end_point")

  reader = CarbonReader() \
    .builder() \
    .withBatch(1000) \
    .withFolder(S3_DATA_PATH) \
    .withHadoopConf("fs.s3a.access.key", access_key) \
    .withHadoopConf("fs.s3a.secret.key", secret_key) \
    .withHadoopConf("fs.s3a.endpoint", end_point) \
    .build()

  num = 0
  try:
    while reader.hasNext():
      num += len(reader.readNextBatchRow())
  finally:
    # Close before asserting so a failed expectation cannot leak the reader.
    reader.close()

  assert 30 == num
Пример #8
0
def test_run_write_carbon():
    """Write ten three-column rows with CarbonWriter and read them back.

    The rows go to a fresh timestamped folder under ``/tmp/data``. The test
    expects the reader to return exactly ten rows and removes the folder
    afterwards.
    """
    # Resolved once instead of on every loop iteration.
    from jnius import autoclass
    arrayListClass = autoclass("java.util.ArrayList")

    jsonSchema = "[{stringField:string},{shortField:short},{intField:int}]"
    path = "/tmp/data/writeCarbon" + str(time.time())

    if os.path.exists(path):
        shutil.rmtree(path)

    try:
        writer = CarbonWriter() \
          .builder() \
          .outputPath(path) \
          .withCsvInput(jsonSchema) \
          .writtenBy("pycarbon") \
          .build()

        try:
            for i in range(0, 10):
                data_list = arrayListClass()
                data_list.add("pycarbon")
                data_list.add(str(i))
                data_list.add(str(i * 10))
                writer.write(data_list.toArray())
        finally:
            writer.close()

        reader = CarbonReader() \
          .builder() \
          .withFolder(path) \
          .withBatch(1000) \
          .build()

        row_count = 0
        try:
            while reader.hasNext():
                row_count += len(reader.readNextBatchRow())
        finally:
            reader.close()

        assert 10 == row_count
    finally:
        # Remove the temporary output even when a step above failed.
        if os.path.exists(path):
            shutil.rmtree(path)
Пример #9
0
def test_run_read_carbon_from_local_for_projection():
  """Read ``LOCAL_DATA_PATH`` with a five-column projection; expect 30 rows."""
  from jnius import autoclass
  java_list_class = autoclass('java.util.ArrayList')
  projection_list = java_list_class()
  for column_name in ("name", "age", "image1", "image2", "image3"):
    projection_list.add(column_name)

  reader = CarbonReader() \
    .builder() \
    .withBatch(100) \
    .withFolder(LOCAL_DATA_PATH) \
    .projection(projection_list) \
    .build()

  num = 0
  try:
    while reader.hasNext():
      num += len(reader.readNextBatchRow())
  finally:
    # Close before asserting so a failed expectation cannot leak the reader.
    reader.close()

  assert 30 == num
Пример #10
0
def test_run_read_carbon_from_obs_for_filter():
  """Read ``S3_DATA_PATH`` filtered on ``name == "robot0"``; expect 3 rows.

  Only the ``name`` column is projected. Access key, secret key and endpoint
  come from the ``--access_key``, ``--secret_key`` and ``--end_point`` pytest
  command-line options.
  """
  from jnius import autoclass
  java_list_class = autoclass('java.util.ArrayList')
  projection_list = java_list_class()
  projection_list.add("name")

  # NOTE(review): the ``pytest.config`` global was removed in pytest 5.0; on a
  # newer pytest these options would have to come from the ``request``
  # fixture. Confirm the pytest version this suite is pinned to.
  access_key = pytest.config.getoption("--access_key")
  secret_key = pytest.config.getoption("--secret_key")
  end_point = pytest.config.getoption("--end_point")

  reader = CarbonReader() \
    .builder() \
    .withBatch(780) \
    .withFolder(S3_DATA_PATH) \
    .withHadoopConf("fs.s3a.access.key", access_key) \
    .withHadoopConf("fs.s3a.secret.key", secret_key) \
    .withHadoopConf("fs.s3a.endpoint", end_point) \
    .projection(projection_list) \
    .filterEqual("name", "robot0") \
    .build()

  num = 0
  try:
    while reader.hasNext():
      num += len(reader.readNextBatchRow())
  finally:
    # Close before asserting so a failed expectation cannot leak the reader.
    reader.close()

  assert 3 == num
Пример #11
0
def test_run_write_carbon_binary_base64_encode_decodeInJava_many_files():
    """Write one row per ``.jpg`` under the flowers folder and read them back.

    For each image returned by ``SDKUtil.listFiles`` the row holds the
    base64-encoded image bytes and the text of the sibling ``.txt`` file. The
    writer is built with the ``binary_decoder=base64`` load option, and the
    binary column read back is written to disk without Python-side decoding.
    The test expects exactly three rows.
    """
    # Imported/resolved once up front instead of on every loop iteration.
    from jnius import autoclass
    from jnius.jnius import ByteArray

    jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary},{txtField:string}]"
    path = "/tmp/data/writeCarbon" + str(time.time())

    if os.path.exists(path):
        shutil.rmtree(path)

    jpg_dir = IMAGE_DATA_PATH + "/flowers"

    sdkUtilClass = autoclass("org.apache.carbondata.sdk.file.utils.SDKUtil")
    arrayListClass = autoclass("java.util.ArrayList")
    jpg_files = sdkUtilClass.listFiles(jpg_dir, '.jpg')

    try:
        writer = CarbonWriter() \
          .builder() \
          .outputPath(path) \
          .withCsvInput(jsonSchema) \
          .writtenBy("pycarbon") \
          .withLoadOption("binary_decoder", "base64") \
          .withPageSizeInMb(1) \
          .build()

        try:
            for i in range(0, jpg_files.size()):
                jpg_path = jpg_files.get(i)
                # Both inputs are only read, so read-only modes are enough
                # and also work on read-only fixtures.
                with open(jpg_path, mode='rb') as file_object:
                    content = file_object.read()

                with open(str(jpg_path).replace('.jpg', '.txt'),
                          mode='r') as file_object:
                    txt = file_object.read()

                data_list = arrayListClass()
                data_list.add("pycarbon")
                data_list.add(str(i))
                data_list.add(str(i * 10))
                data_list.add(base64.b64encode(content))
                data_list.add(txt)
                writer.write(data_list.toArray())
        finally:
            writer.close()

        reader = CarbonReader() \
          .builder() \
          .withFolder(path) \
          .withBatch(1000) \
          .build()

        row_count = 0
        try:
            while reader.hasNext():
                for row in reader.readNextBatchRow():
                    row_count += 1
                    for column in row:
                        # Dump large binary columns for the first 19 rows only.
                        if isinstance(column, ByteArray) \
                                and len(column) > 1000 and row_count < 20:
                            with open(path + "/image" + str(row_count) + ".jpg",
                                      'wb') as file_object:
                                file_object.write(column.tostring())
        finally:
            reader.close()

        assert 3 == row_count
    finally:
        # Remove the temporary output even when a step above failed.
        if os.path.exists(path):
            shutil.rmtree(path)
Пример #12
0
def test_download_carbon_from_obs_and_read():
  """Download the ``binary`` prefix of bucket ``sdk`` and read it locally.

  Objects are listed page by page, fetched into ``/tmp/carbonbinary/`` and
  then read with CarbonReader; the test expects 30 rows. Credentials come
  from the ``--access_key``, ``--secret_key`` and ``--end_point`` pytest
  command-line options.
  """
  # NOTE(review): the ``pytest.config`` global was removed in pytest 5.0; on a
  # newer pytest these options would have to come from the ``request``
  # fixture. Confirm the pytest version this suite is pinned to.
  key = pytest.config.getoption("--access_key")
  secret = pytest.config.getoption("--secret_key")
  end_point = pytest.config.getoption("--end_point")

  def list_obs_files(obs_client, bucket_name, prefix):
    """Return the keys of all objects under ``prefix``, following markers."""
    files = []

    pageSize = 1000
    nextMarker = None
    while True:
      resp = obs_client.listObjects(bucket_name, prefix=prefix, max_keys=pageSize, marker=nextMarker)
      files.extend(content.key for content in resp.body.contents)
      if not resp.body.is_truncated:
        break
      nextMarker = resp.body.next_marker

    return files

  def read_obs_files(key, secret, end_point, bucket_name, prefix, downloadPath):
    """Download every object under ``prefix`` to ``downloadPath + <key>``."""
    obsClient = ObsClient(
      access_key_id=key,
      secret_access_key=secret,
      server=end_point,
      long_conn_mode=True
    )
    try:
      files = list_obs_files(obsClient, bucket_name, prefix)
      numOfFiles = len(files)
      print(numOfFiles)
      # Report progress about ten times. Integer division with a floor of 1
      # avoids a zero or fractional modulus when there are fewer than ten
      # files.
      progress_step = max(1, numOfFiles // 10)
      for num, file_key in enumerate(files, 1):
        obsClient.getObject(bucket_name, file_key, downloadPath=downloadPath + file_key)
        if 0 == num % progress_step:
          print(str(num) + ":" + file_key)
    finally:
      # Release the long-lived connection even when a download fails.
      obsClient.close()

  downloadPath = '/tmp/carbonbinary/'

  if os.path.exists(downloadPath):
    shutil.rmtree(downloadPath)

  try:
    read_obs_files(key, secret, end_point, 'sdk', 'binary', downloadPath)

    reader = CarbonReader() \
      .builder() \
      .withBatch(1000) \
      .withFolder(downloadPath) \
      .build()

    num = 0
    try:
      while reader.hasNext():
        num += len(reader.readNextBatchRow())
    finally:
      reader.close()

    assert 30 == num
  finally:
    # Remove the downloaded files even when a step above failed.
    if os.path.exists(downloadPath):
      shutil.rmtree(downloadPath)