def autocomplete(self, snippet, database=None, table=None, column=None, nested=None):
    db = HbaseApi(self.user)
    cluster_name = database

    response = {}

    try:
        if database is None:
            response['databases'] = [cluster['name'] for cluster in db.getClusters()]
        elif table is None:
            tables_meta = db.getTableList(cluster_name)
            response['tables_meta'] = [_table['name'] for _table in tables_meta if _table['enabled']]
        elif column is None:
            tables_meta = db.get(cluster_name, table)
            response['columns'] = []
        else:
            raise PopupException('Could not find column `%s`.`%s`.`%s`' % (database, table, column))
    except Exception, e:
        LOG.warn('Autocomplete data fetching error: %s' % e)
        response['code'] = 500
        response['error'] = e.message

    return response
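# Purely illustrative (values are hypothetical, derived from the branches above): the shapes of the
# dict that autocomplete() builds at each level, and on error.
example_databases = {'databases': ['Cluster']}            # no database given
example_tables = {'tables_meta': ['events', 'variants']}  # database given, no table
example_columns = {'columns': []}                         # database and table given
example_error = {'code': 500, 'error': 'Api Error: ...'}  # after a caught exception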
def put(self, request, original_variant, pk):
    # Allows modifying a variant in HBase/Impala
    f = open('/tmp/superhello.txt', 'w')
    f.write(json.dumps(request.data))
    f.close()

    # We convert the original and modified data to flatjson
    fc = formatConverters(input_file='stuff.json', output_file='stuff.json')
    original_flatjson = fc.convertVariantJsonToFlatJson(json_data=original_variant)
    modified_flatjson = fc.convertVariantJsonToFlatJson(json_data=request.data)

    # We convert the data to hbase, and we modify some fields directly
    # (note: the keys are almost the same for hbase and impala)
    hbase_data = fc.convertVariantFlatJsonToHbase(original_data=original_flatjson, modified_data=modified_flatjson)

    # Impala - We make the query
    query_server = get_query_server_config(name='impala')
    db = dbms.get(request.user, query_server=query_server)
    query = hql_query("INSERT INTO variant(" + ",".join(query_data) + ")")
    handle = db.execute_and_wait(query, timeout_sec=5.0)
    if handle:
        db.close(handle)
    else:
        raise Exception("Impossible to create the variant...")

    # HBase - We add the data in that table too
    hbaseApi = HbaseApi(user=request.user)
    currentCluster = hbaseApi.getClusters().pop()
    rowkey = pk
    hbaseApi.putRow(cluster=currentCluster['name'], tableName='variants', row=rowkey, data=hbase_data)
def __init__(self, request=None, pk=None, impala_data=None, *args, **kwargs):
    if request is None and pk is None and impala_data is None:
        return super(VariantSerializer, self).__init__(*args, **kwargs)

    # We take the information from the database if we don't have it. As we are interested in one variant, we use HBase
    if impala_data is None:
        # Documentation: https://github.com/cloudera/hue/blob/master/apps/hbase/src/hbase/api.py
        hbaseApi = HbaseApi(user=request.user)
        currentCluster = hbaseApi.getClusters().pop()

        # We arbitrarily take 100 rows (to be able to catch every different alternate). Maybe we should take more,
        # not sure about that (we cannot set an endkey with the hbase api). Most of the time 100 rows will be way more than enough.
        variant = hbaseApi.getRows(cluster=currentCluster['name'], tableName='variants', columns=['R', 'I', 'F'], startRowKey=pk, numRows=100, prefix=False)
    else:
        variant = "raw data we got from impala..."

    if variant is not None:
        # We load it in the current object
        if impala_data is None:
            json_data = hbaseToJson(variant)
        else:
            json_data = parquetToJson(impala_data)

        d = jsonToSerializerData(json_data, self.fields, 'variants')

        d['calls'] = []
        for variants_call in json_data['variants.calls[]']:
            call = VariantCallSerializer(variantcall_data=variants_call)
            d['calls'].append(call.data)

        # Load a specific variant
        kwargs['data'] = d
        super(VariantSerializer, self).__init__(*args, **kwargs)
        self.is_valid()
def handle_noargs(self, **options):
    api = HbaseApi()
    cluster_name = api.getClusters()[0]['name']  # Currently pick first configured cluster

    # Check connectivity
    api.connectCluster(cluster_name)

    self.create_analytics_table(api, cluster_name)
    self.load_analytics_table(api, cluster_name)

    self.create_binary_table(api, cluster_name)
    self.load_binary_table(api, cluster_name)
def __init__(self, pUser):
    self.api = HbaseApi(pUser)
    try:
        self.clusters = self.api.getClusters()
    except Exception, e:
        if 'Could not connect to' in e.message:
            raise PopupException(_("HBase DAO Thrift 1 server cannot be contacted: %s") % e.message)
        else:
            error_msg = e.message.split('\n', 1)[0]
            raise PopupException(_("HBase DAO Error: %s") % error_msg)
def api_router(request, url):  # On split, deserialize anything

    def safe_json_load(raw):
        try:
            return json.loads(re.sub(r'(?:\")([0-9]+)(?:\")', r'\1', str(raw)))
        except:
            return raw

    def deserialize(data):
        if type(data) == dict:
            special_type = get_thrift_type(data.pop('hue-thrift-type', ''))
            if special_type:
                return special_type(data)
        if hasattr(data, "__iter__"):
            for i, item in enumerate(data):
                data[i] = deserialize(item)  # Sets local binding, needs to set in data
        return data

    decoded_url_params = [urllib.unquote(arg) for arg in re.split(r'(?<!\\)/', url.strip('/'))]
    url_params = [safe_json_load((arg, request.POST.get(arg[0:16], arg))[arg[0:15] == 'hbase-post-key-'])
                  for arg in decoded_url_params]  # Deserialize later

    if request.POST.get('dest', False):
        url_params += [request.FILES.get(request.REQUEST.get('dest'))]

    return api_dump(HbaseApi(request.user).query(*url_params))
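# Illustrative sketch (not part of the original handler): how api_router() above turns the tail of a
# URL into the positional arguments passed to HbaseApi(request.user).query(*url_params). The sample
# URL is hypothetical; only the split/decode logic mirrors the function above.
import json
import re
import urllib

def _safe_json_load(raw):
    # Same idea as safe_json_load() above: unquote quoted integers, parse JSON-looking segments,
    # and fall back to the raw string otherwise.
    try:
        return json.loads(re.sub(r'(?:\")([0-9]+)(?:\")', r'\1', str(raw)))
    except Exception:
        return raw

url = 'getTableList/Cluster'  # e.g. the tail of POST /hbase/api/getTableList/Cluster (see the test below)
decoded = [urllib.unquote(arg) for arg in re.split(r'(?<!\\)/', url.strip('/'))]
print([_safe_json_load(arg) for arg in decoded])  # -> ['getTableList', 'Cluster']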
def test_list_tables(self):
    if not is_live_cluster():
        raise SkipTest('HUE-2910: Skipping because test is not reentrant')

    for cluster in HbaseApi(self.user).getClusters():
        resp = self.client.post('/hbase/api/getTableList/' + cluster['name'])
        content = json.loads(resp.content)
        assert_true('data' in content, content)
def handle(self, *args, **options):
    if args:
        user = args[0]
    else:
        user = install_sample_user()

    api = HbaseApi(user=user)
    cluster_name = api.getClusters()[0]['name']  # Currently pick first configured cluster

    # Check connectivity
    api.connectCluster(cluster_name)

    self.create_analytics_table(api, cluster_name)
    self.load_analytics_table(api, cluster_name)

    self.create_binary_table(api, cluster_name)
    self.load_binary_table(api, cluster_name)
def config_validator(user):
    res = []

    from hbase.api import HbaseApi
    from hbase.settings import NICE_NAME

    try:
        if not 'test' in sys.argv:  # Avoid tests hanging
            api = HbaseApi(user=user)
            cluster_name = api.getClusters()[0]['name']  # Currently pick first configured cluster

            # Check connectivity
            api.connectCluster(cluster_name)
            api.getTableList(cluster_name)
    except Exception as e:
        print(e)
        if 'Could not connect' in str(e):
            msg = "The application won't work without a running HBase Thrift Server v1."
        else:
            msg = 'Failed to authenticate to HBase Thrift Server, check authentication configurations.'

        LOG.exception(msg)
        res.append((NICE_NAME, _(msg)))

    if get_thrift_transport() == "framed":
        msg = "Hbase config thrift_transport=framed is not supported"
        LOG.exception(msg)
        res.append((NICE_NAME, _(msg)))

    res.extend(validate_thrift_transport(THRIFT_TRANSPORT))

    return res
def test_security_kerberos():
    tmpdir = tempfile.mkdtemp()
    finish = HBASE_CONF_DIR.set_for_testing(tmpdir)

    try:
        xml = hbase_site_xml(authentication='kerberos')
        file(os.path.join(tmpdir, 'hbase-site.xml'), 'w').write(xml)
        reset()

        assert_equal('KERBEROS', get_server_authentication())
        assert_equal('test', get_server_principal())

        security = HbaseApi._get_security()
        assert_equal('test', security['kerberos_principal_short_name'])
        assert_equal(True, security['use_sasl'])
    finally:
        reset()
        finish()
        shutil.rmtree(tmpdir)
def import_of_vcf(request, filename, length):
    # In charge of importing a vcf (converting the vcf to avro, etc.); as it is not fast, this method
    # should be called asynchronously.

    # Connection to the db
    try:
        query_server = get_query_server_config(name='impala')
        db = dbms.get(request.user, query_server=query_server)
    except Exception:
        return False
    hbaseApi = HbaseApi(user=request.user)
    currentCluster = hbaseApi.getClusters().pop()

    # To analyze the content of the vcf, we need to get it from the hdfs to this node
    buffer = min(length, 1024 * 1024 * 512)
    tmp_filename = 'import_' + request.user.username + '_' + str(int(time.time()))
    f = open('/tmp/cgs_' + tmp_filename + '.vcf', mode='w')
    for offset in xrange(0, length, buffer):
        tmp_vcf = request.fs.read(path='/user/' + request.user.username + '/' + filename, offset=offset, length=buffer, bufsize=buffer)
        f.write(tmp_vcf)
    f.close()

    # Now we try to analyze the vcf a little bit more with the correct tool
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_' + tmp_filename + '.vcf', output_file='/tmp/cgs_' + tmp_filename + '.json', input_type='vcf', output_type='jsonflat')
    status, columns, ids_of_samples, rowkeys = convert.convertVcfToFlatJson(request=request, initial_file=filename)
    f = open('/tmp/cgs_superhello.txt', 'w')
    f.write('EXECUTION TIME to flat json:' + str(time.time() - st) + '\n')
    f.close()

    # We put the output on hdfs
    json_size = os.path.getsize('/tmp/cgs_' + tmp_filename + '.json')
    buffer = min(json_size, 1024 * 1024 * 50)
    st = time.time()
    with open('/tmp/cgs_' + tmp_filename + '.json', 'r') as content_file:
        request.fs.create('/user/cgs/' + tmp_filename + '.json', overwrite=True, data='')
        for offset in xrange(0, json_size, buffer):
            ftmp = open('/tmp/cgs_superhello.txt', 'a')
            ftmp.write('Pushing flatjson to hdfs (/user/cgs/' + tmp_filename + ')... ' + str(time.time() - st) + '\n')
            ftmp.close()
            request.fs.append('/user/cgs/' + tmp_filename + '.json', data=content_file.read(buffer))

    # If needed, we modify the avsc file with the new calls (in fact, we get the basic schema
    # and we will add the data from the existing db)
    avro_schema = {}
    with open('myapps/variants/variants.avsc', 'r') as content_file:
        avro_schema = json.loads(content_file.read())
    with open('/tmp/cgs_' + tmp_filename + '.avsc', 'w') as f:
        f.write(json.dumps(avro_schema))

    existing_columns = []
    for field in avro_schema['fields']:
        existing_columns.append(field['name'])

    modified_avro_schema = False
    specific_columns = []  # Used below for the import in impala/hive
    for sample_id in ids_of_samples:
        destination_field = 'I_CALL_' + sample_id
        if destination_field not in specific_columns:
            specific_columns.append(destination_field)

        if destination_field not in existing_columns:
            # The current sample does not exist yet in the avsc file, we need to add it
            call_schema = {"name": destination_field, "type": ["string", "null"], "doc": "Column for a specific sample"}
            avro_schema['fields'].append(call_schema)
            existing_columns.append(destination_field)
            modified_avro_schema = True

    if modified_avro_schema is True:
        with open('/tmp/cgs_' + tmp_filename + '.avsc', 'w') as content_file:
            content_file.write(json.dumps(avro_schema))
        request.fs.create('/user/cgs/cgs_' + tmp_filename + '.avsc', overwrite=True, data=json.dumps(avro_schema))

    # We convert the flat json to hbase (mostly a key mapping)
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_' + tmp_filename + '.json', output_file='/tmp/cgs_' + tmp_filename + '.hbase', input_type='jsonflat', output_type='hbase')
    status = convert.convertFlatJsonToHbase()
    ftmp = open('/tmp/cgs_superhello.txt', 'a')
    ftmp.write('Conversion from flatjson to hbase... ' + str(time.time() - st) + '\n')
    ftmp.close()

    # We put the hbase file on hdfs
    hbase_length = os.path.getsize('/tmp/cgs_' + tmp_filename + '.hbase')
    buffer = min(hbase_length, 1024 * 1024 * 50)
    st = time.time()
    with open('/tmp/cgs_' + tmp_filename + '.hbase', 'r') as content_file:
        request.fs.create('/user/cgs/' + tmp_filename + '.hbase', overwrite=True, data='')
        for offset in xrange(0, hbase_length, buffer):
            cont = content_file.read(buffer)
            ftmp = open('/tmp/cgs_superhello.txt', 'a')
            ftmp.write('Pushing hbase to hdfs (/user/cgs/' + tmp_filename + '.hbase)... ' + str(time.time() - st) + '\n')
            ftmp.close()
            request.fs.append('/user/cgs/' + tmp_filename + '.hbase', data=cont)

    # We convert the hbase file to an avro file
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_' + tmp_filename + '.hbase', output_file='/tmp/cgs_' + tmp_filename + '.avro', input_type='jsonflat', output_type='avro')
    status = convert.convertHbaseToAvro(avscFile='/tmp/cgs_' + tmp_filename + '.avsc')
    ftmp = open('/tmp/cgs_superhello.txt', 'a')
    ftmp.write('Conversion from hbase to avro... ' + str(time.time() - st) + '\n')
    ftmp.close()

    # We put the avro file on hdfs
    st = time.time()
    avro_length = os.path.getsize('/tmp/cgs_' + tmp_filename + '.avro')
    buffer = min(avro_length, 1024 * 1024 * 50)
    with open('/tmp/cgs_' + tmp_filename + '.avro', 'r') as content_file:
        request.fs.create('/user/cgs/' + tmp_filename + '.avro', overwrite=True, data='')
        request.fs.create('/user/cgs/' + tmp_filename + '.archive.avro', overwrite=True, data='')
        for offset in xrange(0, avro_length, buffer):
            cont = content_file.read(buffer)
            ftmp = open('/tmp/cgs_superhello.txt', 'a')
            ftmp.write('Pushing avro to hdfs (/user/cgs/' + tmp_filename + '.avro)... ' + str(time.time() - st) + '\n')
            ftmp.close()
            request.fs.append('/user/cgs/' + tmp_filename + '.avro', data=cont)
            request.fs.append('/user/cgs/' + tmp_filename + '.archive.avro', data=cont)

    tmpf = open('/tmp/cgs_superhello.txt', 'a')

    # 0: We get the columns from the parquet table to detect missing columns for the new calls we just created
    query = hql_query("show column stats variants")
    handle = db.execute_and_wait(query, timeout_sec=30.0)
    data = db.fetch(handle, rows=1000000)
    rows = list(data.rows())
    columns_for_new_calls = []
    existing_calls_columns = []
    for row in rows:
        current_column = row[0]
        if current_column.startswith('i_call_'):
            existing_calls_columns.append(str(current_column).lower())
    for current_sample in ids_of_samples:
        destination_field = str('I_CALL_' + current_sample).lower()
        if destination_field not in existing_calls_columns and destination_field not in columns_for_new_calls:
            columns_for_new_calls.append(destination_field)
    tmpf.write("Existing calls: " + json.dumps(existing_calls_columns) + "\r\n")
    tmpf.write("New calls: " + json.dumps(columns_for_new_calls))
    tmpf.close()

    # 1st: we create a temporary hive table with avro storage
    st = time.time()
    result, variants_table = database_create_variants(request, temporary=True, specific_columns=specific_columns)
    tmpf = open('/tmp/cgs_superhello.txt', 'a')

    # 2nd: we import the previously created avro file inside the temporary avro table
    query_server = get_query_server_config(name='hive')
    hive_db = dbms.get(request.user, query_server=query_server)
    variants_columns = []
    for variants_column in variants_table:
        variants_columns.append(str(variants_column).split(' ').pop(0))
    query = hql_query("load data inpath '/user/cgs/" + tmp_filename + ".avro' into table variants_tmp_" + request.user.username + ";")
    handle = hive_db.execute_and_wait(query, timeout_sec=3600.0)

    # Necessary for impala to detect a hive table
    query = hql_query("invalidate metadata;")
    handle = db.execute_and_wait(query, timeout_sec=30.0)

    # 3rd: if needed, we modify the global parquet table to add any new columns for each call
    if len(columns_for_new_calls) > 0:
        query = hql_query("alter table variants add columns (" + ' STRING, '.join(columns_for_new_calls) + " STRING)")
        handle = db.execute_and_wait(query, timeout_sec=3600.0)

    # 4th: we import the data from the temporary avro table to the global parquet table
    query = hql_query("insert into table variants (" + ','.join(variants_columns) + ") select " + ','.join(variants_columns) + " from variants_tmp_" + request.user.username + " ;")
    handle = db.execute_and_wait(query, timeout_sec=3600.0)

    # 5th: we delete the temporary table
    #query = hql_query("drop table variants_tmp_"+request.user.username+";")
    #handle = hive_db.execute_and_wait(query, timeout_sec=30.0)

    ftmp = open('/tmp/cgs_superhello.txt', 'a')
    ftmp.write('Creation of temporary table, import to global variants table (parquet): ' + str(time.time() - st) + '\n')
    ftmp.close()

    st = time.time()
    # We put the data in HBase. For now we do it simply, but we should use the bulk upload (TODO)
    with open('/tmp/cgs_' + tmp_filename + '.hbase', 'r') as content_file:
        for line in content_file:
            # We create the json content
            hbase_data = json.loads(line)
            rowkey = hbase_data['rowkey']
            del hbase_data['rowkey']
            del hbase_data['pk']

            # We can save the new variant
            hbaseApi.putRow(cluster=currentCluster['name'], tableName='variants', row=rowkey, data=hbase_data)

    ftmp = open('/tmp/cgs_superhello.txt', 'a')
    ftmp.write('Import into HBase: ' + str(time.time() - st) + '\n')
    ftmp.close()

    # We delete the temporary files previously created on this node
    os.remove('/tmp/cgs_' + tmp_filename + '.avsc')
    os.remove('/tmp/cgs_' + tmp_filename + '.vcf')
    os.remove('/tmp/cgs_' + tmp_filename + '.json')
    os.remove('/tmp/cgs_' + tmp_filename + '.avro')
    os.remove('/tmp/cgs_' + tmp_filename + '.hbase')

    return True
##
## Insert various data into HBase
##
## cd $HUE_HOME (e.g. cd /usr/share/hue, or /opt/cloudera/parcels/CDH-XXXXX/share/hue if using parcels)
## build/env/bin/hue shell
##

from hbase.api import HbaseApi

HbaseApi().putRow('Cluster', 'events', 'hue-20130801', {'doc:txt': 'Hue is awesome!'})
HbaseApi().putRow('Cluster', 'events', 'hue-20130801', {'doc:json': '{"user": "******", "coolness": "extra"}'})
HbaseApi().putRow('Cluster', 'events', 'hue-20130802', {'doc:version': '<xml>I like HBase</xml>'})
HbaseApi().putRow('Cluster', 'events', 'hue-20130802', {'doc:version': '<xml>I LOVE HBase</xml>'})

## From https://github.com/romainr/hadoop-tutorials-examples
## cd /tmp
## git clone https://github.com/romainr/hadoop-tutorials-examples.git

root = '/tmp/hadoop-tutorials-examples'

HbaseApi().putRow('Cluster', 'events', 'hue-20130801', {'doc:img': open(root + '/hbase-tables/data/hue-logo.png', "rb").read()})
HbaseApi().putRow('Cluster', 'events', 'hue-20130801', {'doc:html': open(root + '/hbase-tables/data/gethue.com.html', "rb").read()})
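## Illustrative follow-up (not in the original snippet): reading back the rows inserted above.
## The getRows() keyword arguments mirror the calls used elsewhere on this page; the column family
## filter ['doc'] and the numRows value are assumptions for the sake of the example.
rows = HbaseApi().getRows(cluster='Cluster', tableName='events', columns=['doc'],
                          startRowKey='hue-20130801', numRows=10, prefix=False)
for row in rows:
    print(row)  # Thrift row results; the exact shape depends on the HBase Thrift client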