Example #1
File: api3.py Project: zzzharpreet/hue
def _create_table(request, source, destination, start_time=-1):
  notebook = SQLIndexer(user=request.user, fs=request.fs).create_table_from_a_file(source, destination, start_time)

  if request.POST.get('show_command'):
    return {'status': 0, 'commands': notebook.get_str()}
  else:
    return notebook.execute(request, batch=False)
Example #2
def test_generate_create_empty_kudu_table():
    source = json.loads(
        '''{"sourceType": "impala", "apiHelperType": "impala", "path": "", "inputFormat": "manual"}'''
    )
    destination = json.loads(
        '''{"sourceType": "impala", "name":"default.manual_empty_kudu","apiHelperType":"impala","description":"","outputFormat":"table","columns":[{"operations":[],"comment":"","nested":[],"name":"acct_client","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"string","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"tran_amount","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"double","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"tran_country_cd","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"string","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"vrfcn_city","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"string","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"vrfcn_city_lat","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"double","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"vrfcn_city_lon","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"double","showProperties":false,"scale":0}],"bulkColumnNames":"acct_client,tran_amount,tran_country_cd,vrfcn_city,vrfcn_city_lat,vrfcn_city_lon","showProperties":false,"isTargetExisting":false,"isTargetChecking":false,"existingTargetUrl":"","tableName":"manual_kudu_table","databaseName":"default","tableFormat":"kudu","KUDU_DEFAULT_RANGE_PARTITION_COLUMN":{"values":[{"value":""}],"name":"VALUES","lower_val":0,"include_lower_val":"<=","upper_val":1,"include_upper_val":"<="},"KUDU_DEFAULT_PARTITION_COLUMN":{"columns":[],"range_partitions":[{"values":[{"value":""}],"name":"VALUES","lower_val":0,"include_lower_val":"<=","upper_val":1,"include_upper_val":"<="}],"name":"HASH","int_val":16},"tableFormats":[{"value":"text","name":"Text"},{"value":"parquet","name":"Parquet"},{"value":"kudu","name":"Kudu"},{"value":"csv","name":"Csv"},{"value":"avro","name":"Avro"},{"value":"json","name":"Json"},{"value":"regexp","name":"Regexp"},{"value":"orc","name":"ORC"}],"partitionColumns":[],"kuduPartitionColumns":[],"primaryKeys": ["acct_client"],"primaryKeyObjects":[],"importData":false,"useDefaultLocation":true,"nonDefaultLocation":"/user/hue/data/query-hive-360.csv","hasHeader":false,"useCustomDelimiters":false,"customFieldDelimiter":",","customCollectionDelimiter":"\\\\002","customMapDelimiter":"\\\\003","customRegexp":""}'''
    )

    path = {
        'isDir': False,
        'split': ('/user/hue/data', 'query-hive-360.csv'),
        'listdir': ['/user/hue/data']
    }
    request = MockRequest(fs=MockFs(path=path))

    sql = SQLIndexer(user=request.user,
                     fs=request.fs).create_table_from_a_file(
                         source, destination).get_str()

    assert_true(
        '''CREATE TABLE `default`.`manual_empty_kudu`
(
  `acct_client` string ,
  `tran_amount` double ,
  `tran_country_cd` string ,
  `vrfcn_city` string ,
  `vrfcn_city_lat` double ,
  `vrfcn_city_lon` double , PRIMARY KEY (acct_client)
)   STORED AS kudu ;''' in sql, sql)
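The tests above build a MockRequest wrapping a MockFs; those helpers live in Hue's test utilities and are not shown on this page. Below is a minimal, hypothetical sketch of what such stand-ins could look like, inferred only from how the tests pass the path dict (its 'isDir', 'split', and 'listdir' keys); the real helpers differ.

# Hypothetical stand-ins for the MockFs / MockRequest helpers used in these tests.
# Method names and behaviour are assumptions inferred from the 'path' dict keys;
# the real helpers live in Hue's test suite.
class MockFs(object):
  def __init__(self, path=None):
    self.path = path or {}

  def isdir(self, path):
    # Backed by the 'isDir' flag supplied by the test.
    return self.path.get('isDir', False)

  def split(self, path):
    # Backed by the precomputed ('directory', 'file') tuple.
    return self.path.get('split', ('/', path))

  def listdir(self, path):
    return self.path.get('listdir', [])


class MockUser(object):
  username = 'test'


class MockRequest(object):
  def __init__(self, fs):
    self.fs = fs
    self.user = MockUser()
    self.POST = {}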
Example #3
File: sql_tests.py Project: dulems/hue-1
def test_generate_create_text_table_with_data_partition():
  source = {u'sampleCols': [{u'operations': [], u'comment': u'', u'name': u'customers.id', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'partitionValue': u'', u'multiValued': False, u'unique': False, u'type': u'bigint', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'customers.name', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'partitionValue': u'', u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'customers.email_preferences', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'partitionValue': u'', u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'customers.addresses', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'partitionValue': u'', u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'customers.orders', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'partitionValue': u'', u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}], u'name': u'', u'inputFormat': u'file', u'format': {u'status': 0, u'fieldSeparator': u',', u'hasHeader': True, u'quoteChar': u'"', u'recordSeparator': u'\\n', u'type': u'csv'}, u'defaultName': u'default.customer_stats', u'show': True, u'tableName': u'', u'sample': [], u'apiHelperType': u'hive', u'inputFormatsAll': [{u'name': u'File', u'value': u'file'}, {u'name': u'Manually', u'value': u'manual'}, {u'name': u'SQL Query', u'value': u'query'}, {u'name': u'Table', u'value': u'table'}], u'query': u'', u'databaseName': u'default', u'table': u'', u'inputFormats': [{u'name': u'File', u'value': u'file'}, {u'name': u'Manually', u'value': u'manual'}, {u'name': u'SQL Query', u'value': u'query'}, {u'name': u'Table', u'value': u'table'}], u'path': u'/user/romain/customer_stats.csv', u'draggedQuery': u'', u'inputFormatsManual': [{u'name': u'Manually', u'value': u'manual'}], u'isObjectStore': False}
  destination = {u'KUDU_DEFAULT_PARTITION_COLUMN': {u'int_val': 16, u'name': u'HASH', u'columns': [], u'range_partitions': [{u'include_upper_val': u'<=', u'upper_val': 1, u'name': u'VALUES', u'include_lower_val': u'<=', u'lower_val': 0, u'values': [{u'value': u''}]}]}, u'isTargetChecking': False, u'tableName': u'customer_stats', u'outputFormatsList': [{u'name': u'Table', u'value': u'table'}, {u'name': u'Solr index', u'value': u'index'}, {u'name': u'File', u'value': u'file'}, {u'name': u'Database', u'value': u'database'}], u'customRegexp': u'', u'isTargetExisting': False, u'partitionColumns': [{u'operations': [], u'comment': u'', u'name': u'new_field_1', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': True, u'length': 100, u'partitionValue': u'AAA', u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}], u'useCustomDelimiters': False, u'apiHelperType': u'hive', u'kuduPartitionColumns': [], u'outputFormats': [{u'name': u'Table', u'value': u'table'}, {u'name': u'Solr index', u'value': u'index'}], u'customMapDelimiter': u'\\003', u'showProperties': False, u'useDefaultLocation': True, u'description': u'', u'primaryKeyObjects': [], u'customFieldDelimiter': u',', u'existingTargetUrl': u'', u'importData': True, u'databaseName': u'default', u'KUDU_DEFAULT_RANGE_PARTITION_COLUMN': {u'include_upper_val': u'<=', u'upper_val': 1, u'name': u'VALUES', u'include_lower_val': u'<=', u'lower_val': 0, u'values': [{u'value': u''}]}, u'primaryKeys': [], u'outputFormat': u'table', u'nonDefaultLocation': u'/user/romain/customer_stats.csv', u'name': u'default.customer_stats', u'tableFormat': u'text', 'ouputFormat': u'table', u'bulkColumnNames': u'customers.id,customers.name,customers.email_preferences,customers.addresses,customers.orders', u'columns': [{u'operations': [], u'comment': u'', u'name': u'customers.id', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'partitionValue': u'', u'multiValued': False, u'unique': False, u'type': u'bigint', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'customers.name', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'partitionValue': u'', u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'customers.email_preferences', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'partitionValue': u'', u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'customers.addresses', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'partitionValue': u'', u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'customers.orders', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'partitionValue': u'', u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}], u'hasHeader': True, u'tableFormats': [{u'name': u'Text', u'value': u'text'}, {u'name': u'Parquet', u'value': u'parquet'}, {u'name': u'Kudu', u'value': u'kudu'}, {u'name': u'Csv', 
u'value': u'csv'}, {u'name': u'Avro', u'value': u'avro'}, {u'name': u'Json', u'value': u'json'}, {u'name': u'Regexp', u'value': u'regexp'}, {u'name': u'ORC', u'value': u'orc'}], u'customCollectionDelimiter': u'\\002'}
  request = MockRequest(fs=MockFs())

  sql = SQLIndexer(user=request.user, fs=request.fs).create_table_from_a_file(source, destination).get_str()

  assert_true('''USE default;''' in  sql, sql)

  assert_true('''CREATE TABLE `default`.`customer_stats`
(
  `customers.id` bigint ,
  `customers.name` string ,
  `customers.email_preferences` string ,
  `customers.addresses` string ,
  `customers.orders` string ) PARTITIONED BY (
  `new_field_1` string )
ROW FORMAT   DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '\\002'
    MAP KEYS TERMINATED BY '\\003'
  STORED AS TextFile TBLPROPERTIES("skip.header.line.count" = "1")
;''' in  sql, sql)

  assert_true('''LOAD DATA INPATH '/user/romain/customer_stats.csv' INTO TABLE `default`.`customer_stats` PARTITION (new_field_1='AAA');''' in  sql, sql)
Example #4
File: sql_tests.py Project: dulems/hue-1
def test_generate_create_parquet_table():
  source = json.loads('''{"name":"","sample":[["Bank Of America","3000000.0","US","Miami","37.6801986694","-121.92150116"],["Citi Bank","2800000.0","US","Richmond","37.5242004395","-77.4932022095"],["Deutsche Bank","2600000.0","US","Corpus Christi","40.7807998657","-73.9772033691"],["Thomson Reuters","2400000.0","US","Albany","35.7976989746","-78.6252975464"],["OpenX","2200000.0","US","Des Moines","40.5411987305","-119.586898804"]],"sampleCols":[{"operations":[],"comment":"","nested":[],"name":"acct_client","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"string","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"tran_amount","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"double","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"tran_country_cd","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"string","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"vrfcn_city","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"string","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"vrfcn_city_lat","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"double","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"vrfcn_city_lon","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"double","showProperties":false,"scale":0}],"inputFormat":"file","inputFormatsAll":[{"value":"file","name":"File"},{"value":"manual","name":"Manually"},{"value":"query","name":"SQL Query"},{"value":"table","name":"Table"}],"inputFormatsManual":[{"value":"manual","name":"Manually"}],"inputFormats":[{"value":"file","name":"File"},{"value":"manual","name":"Manually"},{"value":"query","name":"SQL Query"},{"value":"table","name":"Table"}],"path":"/user/hue/data/query-hive-360.csv","isObjectStore":false,"table":"","tableName":"","databaseName":"default","apiHelperType":"hive","query":"","draggedQuery":"","format":{"type":"csv","fieldSeparator":",","recordSeparator":"\\n","quoteChar":"\\"","hasHeader":true,"status":0},"show":true,"defaultName":"default.query-hive-360"}''')
  destination = json.loads('''{"name":"default.parquet_table","apiHelperType":"hive","description":"","outputFormat":"table","outputFormatsList":[{"name":"Table","value":"table"},{"name":"Solr index","value":"index"},{"name":"File","value":"file"},{"name":"Database","value":"database"}],"outputFormats":[{"name":"Table","value":"table"},{"name":"Solr index","value":"index"}],"columns":[{"operations":[],"comment":"","nested":[],"name":"acct_client","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"string","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"tran_amount","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"double","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"tran_country_cd","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"string","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"vrfcn_city","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"string","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"vrfcn_city_lat","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"double","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"vrfcn_city_lon","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"double","showProperties":false,"scale":0}],"bulkColumnNames":"acct_client,tran_amount,tran_country_cd,vrfcn_city,vrfcn_city_lat,vrfcn_city_lon","showProperties":false,"isTargetExisting":false,"isTargetChecking":false,"existingTargetUrl":"","tableName":"parquet_table","databaseName":"default","tableFormat":"parquet","KUDU_DEFAULT_RANGE_PARTITION_COLUMN":{"values":[{"value":""}],"name":"VALUES","lower_val":0,"include_lower_val":"<=","upper_val":1,"include_upper_val":"<="},"KUDU_DEFAULT_PARTITION_COLUMN":{"columns":[],"range_partitions":[{"values":[{"value":""}],"name":"VALUES","lower_val":0,"include_lower_val":"<=","upper_val":1,"include_upper_val":"<="}],"name":"HASH","int_val":16},"tableFormats":[{"value":"text","name":"Text"},{"value":"parquet","name":"Parquet"},{"value":"kudu","name":"Kudu"},{"value":"csv","name":"Csv"},{"value":"avro","name":"Avro"},{"value":"json","name":"Json"},{"value":"regexp","name":"Regexp"},{"value":"orc","name":"ORC"}],"partitionColumns":[],"kuduPartitionColumns":[],"primaryKeys":[],"primaryKeyObjects":[],"importData":true,"useDefaultLocation":true,"nonDefaultLocation":"/user/hue/data/query-hive-360.csv","hasHeader":true,"useCustomDelimiters":false,"customFieldDelimiter":",","customCollectionDelimiter":"\\\\002","customMapDelimiter":"\\\\003","customRegexp":""}''')

  path = {'isDir': False, 'split': ('/user/hue/data', 'query-hive-360.csv'), 'listdir': ['/user/hue/data']}
  request = MockRequest(fs=MockFs(path=path))

  sql = SQLIndexer(user=request.user, fs=request.fs).create_table_from_a_file(source, destination).get_str()

  assert_true('''USE default;''' in  sql, sql)

  assert_true('''CREATE EXTERNAL TABLE `default`.`hue__tmp_parquet_table`
(
  `acct_client` string ,
  `tran_amount` double ,
  `tran_country_cd` string ,
  `vrfcn_city` string ,
  `vrfcn_city_lat` double ,
  `vrfcn_city_lon` double ) ROW FORMAT   DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '\\002'
    MAP KEYS TERMINATED BY '\\003'
  STORED AS TextFile LOCATION '/user/hue/data'
TBLPROPERTIES("skip.header.line.count" = "1")
;''' in  sql, sql)

  assert_true('''CREATE TABLE `default`.`parquet_table`
        STORED AS parquet
        AS SELECT *
        FROM `default`.`hue__tmp_parquet_table`;
''' in  sql, sql)

  assert_true('''DROP TABLE IF EXISTS `default`.`hue__tmp_parquet_table`;
''' in  sql, sql)
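Since get_str() returns all of the generated statements as one string, it can help to break the output into individual statements when inspecting it. The helper below is a small, self-contained sketch (not part of Hue) that naively splits on ';'; that is enough for eyeballing output like the asserted SQL above, but not safe for statements containing literal semicolons.

# Standalone helper (not from Hue) for splitting a multi-statement SQL string
# such as the get_str() output above. Naive: assumes ';' only terminates statements.
def split_statements(sql):
  return [statement.strip() + ';' for statement in sql.split(';') if statement.strip()]

sample = "USE default; CREATE TABLE t (a int); DROP TABLE IF EXISTS tmp;"
print(split_statements(sample))
# ['USE default;', 'CREATE TABLE t (a int);', 'DROP TABLE IF EXISTS tmp;']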
Example #5
File: sql_tests.py Project: dulems/hue-1
def test_generate_create_kudu_table_with_data():
  source = {u'apiHelperType': 'hive', u'sampleCols': [], u'name': u'', u'inputFormat': u'file', u'format': {u'quoteChar': u'"', u'recordSeparator': u'\\n', u'type': u'csv', u'hasHeader': True, u'fieldSeparator': u','}, u'show': True, u'tableName': u'', u'sample': [], u'defaultName': u'index_data', u'query': u'', u'databaseName': u'default', u'table': u'', u'inputFormats': [{u'name': u'File', u'value': u'file'}, {u'name': u'Manually', u'value': u'manual'}], u'path': u'/user/admin/index_data.csv', u'draggedQuery': u'', u'isObjectStore': False}
  destination = {u'KUDU_DEFAULT_PARTITION_COLUMN': {u'int_val': 16, u'name': u'HASH', u'columns': [], u'range_partitions': [{u'include_upper_val': u'<=', u'upper_val': 1, u'name': u'VALUES', u'include_lower_val': u'<=', u'lower_val': 0, u'values': [{u'value': u''}]}]}, u'tableName': u'index_data', u'outputFormatsList': [{u'name': u'Table', u'value': u'table'}, {u'name': u'Solr+index', u'value': u'index'}, {u'name': u'File', u'value': u'file'}, {u'name': u'Database', u'value': u'database'}], u'customRegexp': u'', u'isTargetExisting': False, u'partitionColumns': [], u'useCustomDelimiters': True, u'kuduPartitionColumns': [{u'int_val': 16, u'name': u'HASH', u'columns': [u'id'], u'range_partitions': [{u'include_upper_val': u'<=', u'upper_val': 1, u'name': u'VALUES', u'include_lower_val': u'<=', u'lower_val': 0, u'values': [{u'value': u''}]}]}], u'outputFormats': [{u'name': u'Table', u'value': u'table'}, {u'name': u'Solr+index', u'value': u'index'}], u'customMapDelimiter': None, u'showProperties': False, u'useDefaultLocation': True, u'description': u'Big Data', u'primaryKeyObjects': [{u'operations': [], u'comment': u'', u'name': u'id', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}], u'customFieldDelimiter': u',', u'existingTargetUrl': u'', u'importData': True, u'databaseName': u'default', u'KUDU_DEFAULT_RANGE_PARTITION_COLUMN': {u'include_upper_val': u'<=', u'upper_val': 1, u'name': u'VALUES', u'include_lower_val': u'<=', u'lower_val': 0, u'values': [{u'value': u''}]}, u'primaryKeys': [u'id'], u'outputFormat': u'table', u'nonDefaultLocation': u'/user/admin/index_data.csv', u'name': u'index_data', u'tableFormat': u'kudu', u'bulkColumnNames': u'business_id,cool,date,funny,id,stars,text,type,useful,user_id,name,full_address,latitude,longitude,neighborhoods,open,review_count,state', u'columns': [{u'operations': [], u'comment': u'', u'name': u'business_id', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'cool', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'bigint', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'date', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'funny', u'level': 0, u'scale':4, u'precision':10, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'decimal', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'id', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'stars', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, 
u'unique': False, u'type': u'bigint', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'text', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'type', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'useful', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'bigint', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'user_id', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'name', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'full_address', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'latitude', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'double', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'longitude', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'double', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'neighborhoods', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'open', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'review_count', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'bigint', u'showProperties': False, u'keep': True}, {u'operations': [], u'comment': u'', u'name': u'state', u'level': 0, u'keyType': u'string', u'required': False, u'nested': [], u'isPartition': False, u'length': 100, u'multiValued': False, u'unique': False, u'type': u'string', u'showProperties': False, u'keep': True}], u'hasHeader': True, u'tableFormats': [{u'name': u'Text', u'value': u'text'}, {u'name': u'Parquet', u'value': u'parquet'}, {u'name': u'Json', u'value': u'json'}, {u'name': u'Kudu', u'value': u'kudu'}, {u'name': 
u'Avro', u'value': u'avro'}, {u'name': u'Regexp', u'value': u'regexp'}, {u'name': u'RCFile', u'value': u'rcfile'}, {u'name': u'ORC', u'value': u'orc'}, {u'name': u'SequenceFile', u'value': u'sequencefile'}], u'customCollectionDelimiter': None}
  request = MockRequest(fs=MockFs())

  sql = SQLIndexer(user=request.user, fs=request.fs).create_table_from_a_file(source, destination).get_str()

  assert_true('''DROP TABLE IF EXISTS `default`.`hue__tmp_index_data`;''' in  sql, sql)

  assert_true('''CREATE EXTERNAL TABLE `default`.`hue__tmp_index_data`
(
  `business_id` string ,
  `cool` bigint ,
  `date` string ,
  `funny` decimal(10, 4) ,
  `id` string ,
  `stars` bigint ,
  `text` string ,
  `type` string ,
  `useful` bigint ,
  `user_id` string ,
  `name` string ,
  `full_address` string ,
  `latitude` double ,
  `longitude` double ,
  `neighborhoods` string ,
  `open` string ,
  `review_count` bigint ,
  `state` string ) COMMENT "Big Data"
ROW FORMAT   DELIMITED
    FIELDS TERMINATED BY ','
  STORED AS TextFile LOCATION '/A'
TBLPROPERTIES("skip.header.line.count" = "1")''' in  sql, sql)

  assert_true('''CREATE TABLE `default`.`index_data` COMMENT "Big Data"
        PRIMARY KEY (id)
        PARTITION BY HASH PARTITIONS 16
        STORED AS kudu
        TBLPROPERTIES(
        'kudu.num_tablet_replicas' = '1'
        )
        AS SELECT `id`, `business_id`, `cool`, `date`, `funny`, `stars`, `text`, `type`, `useful`, `user_id`, `name`, `full_address`, `latitude`, `longitude`, `neighborhoods`, `open`, `review_count`, `state`
        FROM `default`.`hue__tmp_index_data`;''' in  sql, sql)
Example #6
def _create_table(request, source, destination, start_time=-1):
  notebook = SQLIndexer(user=request.user, fs=request.fs).create_table_from_a_file(source, destination, start_time)
  return notebook.execute(request, batch=False)
Example #7
def _envelope_job(request, file_format, destination, start_time=None, lib_path=None):
  collection_name = destination['name']
  indexer = EnvelopeIndexer(request.user, request.fs)

  lib_path = '/tmp/envelope-0.5.0.jar'
  input_path = None
  properties = {}  # avoid UnboundLocalError for input formats (e.g. 'table') that do not set it below

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
    properties = {
      'format': 'json'
    }
  elif file_format['inputFormat'] == 'stream':
    if file_format['streamSelection'] == 'sfdc':
      properties = {
        'streamSelection': file_format['streamSelection'],
        'streamUsername': file_format['streamUsername'],
        'streamPassword': file_format['streamPassword'],
        'streamToken': file_format['streamToken'],
        'streamEndpointUrl': file_format['streamEndpointUrl'],
        'streamObject': file_format['streamObject'],
      }
    elif file_format['streamSelection'] == 'kafka':
      manager = ManagerApi()
      properties = {
        "brokers": manager.get_kafka_brokers(),
        "output_table": "impala::%s" % collection_name,
        "topics": file_format['kafkaSelectedTopics'],
        "kafkaFieldType": file_format['kafkaFieldType'],
        "kafkaFieldDelimiter": file_format['kafkaFieldDelimiter'],
        "kafkaFieldNames": file_format['kafkaFieldNames'],
        "kafkaFieldTypes": file_format['kafkaFieldTypes']
      }

    if destination['outputFormat'] == 'table':
      if destination['isTargetExisting']:
        # Todo: check if format matches
        pass
      else:
        sql = SQLIndexer(user=request.user, fs=request.fs).create_table_from_a_file(file_format, destination).get_str()
        print(sql)
      if destination['tableFormat'] == 'kudu':
        manager = ManagerApi()
        properties["output_table"] = "impala::%s" % collection_name
        properties["kudu_master"] = manager.get_kudu_master()
      else:
        properties['output_table'] = collection_name
    elif destination['outputFormat'] == 'file':
      properties['path'] = file_format["path"]
      properties['format'] = file_format['tableFormat'] # or csv
    elif destination['outputFormat'] == 'index':
      properties['collectionName'] = collection_name
      properties['connection'] = SOLR_URL.get()
      if destination['isTargetExisting']:
        # Todo: check if format matches
        pass
      else:
        client = SolrClient(request.user)
        kwargs = {}
        _create_solr_collection(request.user, request.fs, client, destination, collection_name, kwargs)

  properties["app_name"] = 'Data Ingest'
  properties["inputFormat"] = file_format['inputFormat']
  properties["ouputFormat"] = destination['ouputFormat']
  properties["streamSelection"] = file_format["streamSelection"]

  envelope = indexer.generate_config(properties)

  return indexer.run(request, collection_name, envelope, input_path, start_time=start_time, lib_path=lib_path)
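For reference, this is an illustrative shape of the properties dict the function above hands to indexer.generate_config() on the Kafka-to-Kudu-table path. The keys match what the function sets; every value is a made-up placeholder, and 'ouputFormat' is spelled exactly as in the source.

# Illustrative only: keys match the assignments in the function above for the
# Kafka -> Kudu table path; all values are placeholders.
properties = {
  'brokers': 'broker-1:9092,broker-2:9092',   # from ManagerApi().get_kafka_brokers()
  'topics': 'sample_topic',
  'kafkaFieldType': 'delimited',
  'kafkaFieldDelimiter': ',',
  'kafkaFieldNames': 'id,name,amount',
  'kafkaFieldTypes': 'int,string,double',
  'output_table': 'impala::sample_table',     # Kudu tables are addressed through Impala
  'kudu_master': 'kudu-master.example.com',   # from ManagerApi().get_kudu_master()
  'app_name': 'Data Ingest',
  'inputFormat': 'stream',
  'ouputFormat': 'table',                     # key spelled as in the source
  'streamSelection': 'kafka',
}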
Example #8
File: api3.py Project: zhengwei2020/hue
def _envelope_job(request,
                  file_format,
                  destination,
                  start_time=None,
                  lib_path=None):
    collection_name = destination['name']
    indexer = EnvelopeIndexer(request.user, request.fs)

    lib_path = None  # Todo optional input field
    input_path = None
    properties = {}  # avoid UnboundLocalError for input formats (e.g. 'table') that do not set it below

    if file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])
        input_path = table_metadata.path_location
    elif file_format['inputFormat'] == 'file':
        input_path = file_format["path"]
        properties = {'input_path': input_path, 'format': 'csv'}
    elif file_format['inputFormat'] == 'stream' and file_format[
            'streamSelection'] == 'flume':
        pass
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            manager = ManagerApi()
            properties = {
                "brokers": manager.get_kafka_brokers(),
                "topics": file_format['kafkaSelectedTopics'],
                "kafkaFieldType": file_format['kafkaFieldType'],
                "kafkaFieldDelimiter": file_format['kafkaFieldDelimiter'],
            }

            if file_format.get(
                    'kafkaSelectedTopics') == 'NavigatorAuditEvents':
                schema_fields = MorphlineIndexer.get_kept_field_list(
                    file_format['sampleCols'])
                properties.update({
                    "kafkaFieldNames":
                    ', '.join([_field['name'] for _field in schema_fields]),
                    "kafkaFieldTypes":
                    ', '.join([_field['type'] for _field in schema_fields])
                })
            else:
                properties.update({
                    "kafkaFieldNames":
                    file_format['kafkaFieldNames'],
                    "kafkaFieldTypes":
                    file_format['kafkaFieldTypes']
                })

            if True:
                properties['window'] = ''
            else:  # For "KafkaSQL"
                properties['window'] = '''
            window {
                enabled = true
                milliseconds = 60000
            }'''
    elif file_format['inputFormat'] == 'connector':
        if file_format['streamSelection'] == 'flume':
            properties = {
                'streamSelection':
                file_format['streamSelection'],
                'channelSourceHosts':
                file_format['channelSourceHosts'],
                'channelSourceSelectedHosts':
                file_format['channelSourceSelectedHosts'],
                'channelSourcePath':
                file_format['channelSourcePath'],
            }
        else:
            # sfdc
            properties = {
                'streamSelection': file_format['streamSelection'],
                'streamUsername': file_format['streamUsername'],
                'streamPassword': file_format['streamPassword'],
                'streamToken': file_format['streamToken'],
                'streamEndpointUrl': file_format['streamEndpointUrl'],
                'streamObject': file_format['streamObject'],
            }

    if destination['outputFormat'] == 'table':
        if destination['isTargetExisting']:  # Todo: check if format matches
            pass
        else:
            destination['importData'] = False  # Avoid LOAD DATA
            if destination['tableFormat'] == 'kudu':
                properties['kafkaFieldNames'] = properties[
                    'kafkaFieldNames'].lower(
                    )  # Kudu names should be all lowercase
            # Create table
            if not request.POST.get('show_command'):
                SQLIndexer(user=request.user,
                           fs=request.fs).create_table_from_a_file(
                               file_format, destination).execute(request)

        if destination['tableFormat'] == 'kudu':
            manager = ManagerApi()
            properties["output_table"] = "impala::%s" % collection_name
            properties["kudu_master"] = manager.get_kudu_master()
        else:
            properties['output_table'] = collection_name
    elif destination['outputFormat'] == 'stream':
        manager = ManagerApi()
        properties['brokers'] = manager.get_kafka_brokers()
        properties['topics'] = file_format['kafkaSelectedTopics']
        properties['kafkaFieldDelimiter'] = file_format['kafkaFieldDelimiter']
    elif destination['outputFormat'] == 'file':
        properties['path'] = file_format["path"]
        if file_format['inputFormat'] == 'stream':
            properties['format'] = 'csv'
        else:
            properties['format'] = file_format['tableFormat']  # or csv
    elif destination['outputFormat'] == 'index':
        properties['collectionName'] = collection_name
        properties['connection'] = SOLR_URL.get()

    properties["app_name"] = 'Data Ingest'
    properties["inputFormat"] = file_format['inputFormat']
    properties["ouputFormat"] = destination['ouputFormat']
    properties["streamSelection"] = file_format["streamSelection"]

    configs = indexer.generate_config(properties)

    if request.POST.get('show_command'):
        return {'status': 0, 'commands': configs['envelope.conf']}
    else:
        return indexer.run(request,
                           collection_name,
                           configs,
                           input_path,
                           start_time=start_time,
                           lib_path=lib_path)
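The NavigatorAuditEvents branch above derives the comma-separated kafkaFieldNames/kafkaFieldTypes from the sampled schema fields. The standalone snippet below (with made-up sample data) illustrates just that join.

# Standalone illustration of the join performed in the NavigatorAuditEvents branch;
# the schema_fields sample data is made up.
schema_fields = [
  {'name': 'service', 'type': 'string'},
  {'name': 'username', 'type': 'string'},
  {'name': 'epoch_time', 'type': 'bigint'},
]

kafka_field_names = ', '.join(field['name'] for field in schema_fields)
kafka_field_types = ', '.join(field['type'] for field in schema_fields)

print(kafka_field_names)  # service, username, epoch_time
print(kafka_field_types)  # string, string, bigint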
Example #9
def _envelope_job(request,
                  file_format,
                  destination,
                  start_time=None,
                  lib_path=None):
    collection_name = destination['name']
    indexer = EnvelopeIndexer(request.user, request.fs)

    lib_path = None  # Todo optional input field
    input_path = None
    properties = {}  # avoid UnboundLocalError for input formats (e.g. 'table') that do not set it below

    if file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])
        input_path = table_metadata.path_location
    elif file_format['inputFormat'] == 'file':
        input_path = file_format["path"]
        properties = {'input_path': input_path, 'format': 'csv'}
    elif file_format['inputFormat'] == 'stream' and file_format[
            'streamSelection'] == 'flume':
        pass
    elif file_format['inputFormat'] in ('stream', 'sfdc'):
        if file_format['inputFormat'] == 'sfdc':
            properties = {
                'streamSelection': file_format['streamSelection'],
                'streamUsername': file_format['streamUsername'],
                'streamPassword': file_format['streamPassword'],
                'streamToken': file_format['streamToken'],
                'streamEndpointUrl': file_format['streamEndpointUrl'],
                'streamObject': file_format['streamObject'],
            }
        elif file_format['streamSelection'] == 'kafka':
            manager = ManagerApi()
            properties = {
                "brokers": manager.get_kafka_brokers(),
                "topics": file_format['kafkaSelectedTopics'],
                "kafkaFieldType": file_format['kafkaFieldType'],
                "kafkaFieldDelimiter": file_format['kafkaFieldDelimiter'],
                "kafkaFieldNames": file_format['kafkaFieldNames'],
                "kafkaFieldTypes": file_format['kafkaFieldTypes']
            }

            if True:
                properties['window'] = ''
            else:  # For "KafkaSQL"
                properties['window'] = '''
            window {
                enabled = true
                milliseconds = 60000
            }'''

        if destination['outputFormat'] == 'table':
            if destination['isTargetExisting']:
                # Todo: check if format matches
                pass
            else:
                sql = SQLIndexer(user=request.user,
                                 fs=request.fs).create_table_from_a_file(
                                     file_format, destination).get_str()
                print(sql)
            if destination['tableFormat'] == 'kudu':
                manager = ManagerApi()
                properties["output_table"] = "impala::%s" % collection_name
                properties["kudu_master"] = manager.get_kudu_master()
            else:
                properties['output_table'] = collection_name
        elif destination['outputFormat'] == 'file':
            properties['path'] = file_format["path"]
            if file_format['inputFormat'] == 'stream':
                properties['format'] = 'csv'
            else:
                properties['format'] = file_format['tableFormat']  # or csv
        elif destination['outputFormat'] == 'index':
            properties['collectionName'] = collection_name
            properties['connection'] = SOLR_URL.get()


# Not needed anymore
#       if destination['isTargetExisting']:
#         # Todo: check if format matches
#         pass
#       else:
#         client = SolrClient(request.user)
#         kwargs = {}
#         _create_solr_collection(request.user, request.fs, client, destination, collection_name, kwargs)

    if destination['outputFormat'] == 'stream':
        manager = ManagerApi()
        properties['brokers'] = manager.get_kafka_brokers()
        properties['topics'] = file_format['kafkaSelectedTopics']
        properties['kafkaFieldDelimiter'] = file_format['kafkaFieldDelimiter']

    properties["app_name"] = 'Data Ingest'
    properties["inputFormat"] = file_format['inputFormat']
    properties["ouputFormat"] = destination['ouputFormat']
    properties["streamSelection"] = file_format["streamSelection"]

    envelope = indexer.generate_config(properties)

    return indexer.run(request,
                       collection_name,
                       envelope,
                       input_path,
                       start_time=start_time,
                       lib_path=lib_path)