Example #1
    def autocomplete(self,
                     snippet,
                     database=None,
                     table=None,
                     column=None,
                     nested=None):
        db = HbaseApi(self.user)
        cluster_name = database

        response = {}

        try:
            if database is None:
                response['databases'] = [
                    cluster['name'] for cluster in db.getClusters()
                ]
            elif table is None:
                tables_meta = db.getTableList(cluster_name)
                response['tables_meta'] = [
                    _table['name'] for _table in tables_meta
                    if _table['enabled']
                ]
            elif column is None:
                tables_meta = db.get(cluster_name, table)
                response['columns'] = []
            else:
                raise PopupException('Could not find column `%s`.`%s`.`%s`' %
                                     (database, table, column))
        except Exception as e:
            LOG.warn('Autocomplete data fetching error: %s' % e)
            response['code'] = 500
            response['error'] = e.message
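
For orientation, here is a minimal sketch (not part of the original example) of listing clusters and enabled tables directly with HbaseApi, assuming a running HBase Thrift v1 server and a Hue user object:

from hbase.api import HbaseApi

db = HbaseApi(user)  # `user` is assumed to be a Hue/Django user object
databases = [cluster['name'] for cluster in db.getClusters()]
tables = [t['name'] for t in db.getTableList(databases[0]) if t['enabled']]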
Example #2
    def put(self, request, original_variant, pk):
        # Allows modifying a variant in HBase/Impala

        f = open('/tmp/superhello.txt','w')
        f.write(json.dumps(request.data))
        f.close()

        # We convert the original and modified data to flatjson
        fc = formatConverters(input_file='stuff.json',output_file='stuff.json')
        original_flatjson = fc.convertVariantJsonToFlatJson(json_data=original_variant)
        modified_flatjson = fc.convertVariantJsonToFlatJson(json_data=request.data)

        # We convert the data to hbase, and we directly modify some fields (note: the keys are almost the same for hbase and impala)
        hbase_data = fc.convertVariantFlatJsonToHbase(original_data=original_flatjson,modified_data=modified_flatjson)


        # Impala - We make the query
        query_server = get_query_server_config(name='impala')
        db = dbms.get(request.user, query_server=query_server)
        query = hql_query("INSERT INTO variant("+",".join(query_data)+")")
        handle = db.execute_and_wait(query, timeout_sec=5.0)
        if handle:
            db.close(handle)
        else:
            raise Exception("Impossible to create the variant...")

        # HBase - We add the data in that table too
        hbaseApi = HbaseApi(user=request.user)
        currentCluster = hbaseApi.getClusters().pop()
        rowkey = pk
        hbaseApi.putRow(cluster=currentCluster['name'], tableName='variants', row=rowkey, data=hbase_data)
Example #3
    def __init__(self, request=None, pk=None, impala_data=None, *args, **kwargs):
        if request is None and pk is None and impala_data is None:
            return super(VariantSerializer, self).__init__(*args, **kwargs)

        # We take the information from the database if we don't have it. As we are interested in one variant, we use HBase
        if impala_data is None:
            # Documentation: https://github.com/cloudera/hue/blob/master/apps/hbase/src/hbase/api.py
            hbaseApi = HbaseApi(user=request.user)
            currentCluster = hbaseApi.getClusters().pop()

            # We arbitrarily take 100 rows to be able to catch every different alternate. Maybe we should take more, not sure about that (we cannot
            # set an end key with the hbase api). Most of the time 100 rows will be way more than enough
            variant = hbaseApi.getRows(cluster=currentCluster['name'], tableName='variants', columns=['R','I','F'], startRowKey=pk, numRows=100, prefix=False)
        else:
            variant = "raw data we got from impala..."

        if variant is not None:
            # We load it in the current object
            if impala_data is None:
                json_data = hbaseToJson(variant)
            else:
                json_data = parquetToJson(impala_data)
            d = jsonToSerializerData(json_data, self.fields, 'variants')

            d['calls'] = []
            for variants_call in json_data['variants.calls[]']:
                call = VariantCallSerializer(variantcall_data=variants_call)
                d['calls'].append(call.data)

            # Load a specific variant
            kwargs['data'] = d
            super(VariantSerializer, self).__init__(*args, **kwargs)

            self.is_valid()
Example #4
  def handle_noargs(self, **options):
    api = HbaseApi()
    cluster_name = api.getClusters()[0]['name'] # Currently pick first configured cluster

    # Check connectivity
    api.connectCluster(cluster_name)

    self.create_analytics_table(api, cluster_name)
    self.load_analytics_table(api, cluster_name)

    self.create_binary_table(api, cluster_name)
    self.load_binary_table(api, cluster_name)
Example #5
    def __init__(self, pUser):

        self.api = HbaseApi(pUser)
        try:
            self.clusters = self.api.getClusters()
        except Exception as e:
            if 'Could not connect to' in e.message:
                raise PopupException(
                    _("HBase DAO Thrift 1 server cannot be contacted: %s") %
                    e.message)
            else:
                error_msg = e.message.split('\n', 1)[0]
                raise PopupException(_("HBase DAO Error: %s") % error_msg)
Example #6
def api_router(request, url):  # On split, deserialize anything
    def safe_json_load(raw):
        try:
            return json.loads(re.sub(r'(?:\")([0-9]+)(?:\")', r'\1', str(raw)))
        except:
            return raw

    def deserialize(data):
        if type(data) == dict:
            special_type = get_thrift_type(data.pop('hue-thrift-type', ''))
            if special_type:
                return special_type(data)

        if hasattr(data, "__iter__"):
            for i, item in enumerate(data):
                data[i] = deserialize(
                    item)  # Sets local binding, needs to set in data
        return data

    decoded_url_params = [
        urllib.unquote(arg) for arg in re.split(r'(?<!\\)/', url.strip('/'))
    ]
    url_params = [
        safe_json_load(
            (arg, request.POST.get(arg[0:16],
                                   arg))[arg[0:15] == 'hbase-post-key-'])
        for arg in decoded_url_params
    ]  # Deserialize later

    if request.POST.get('dest', False):
        url_params += [request.FILES.get(request.REQUEST.get('dest'))]

    return api_dump(HbaseApi(request.user).query(*url_params))
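
To make the routing concrete: the URL is split on unescaped '/', each segment is unquoted, segments starting with 'hbase-post-key-' are swapped for their POST payload, and the resulting list is passed straight to HbaseApi.query(). A hedged sketch of the direct equivalent of the request used in the next example ('/hbase/api/getTableList/<cluster>'):

from hbase.api import HbaseApi

# Equivalent of POSTing to /hbase/api/getTableList/Cluster (assuming a Hue user object)
tables = HbaseApi(user).query('getTableList', 'Cluster')  # 'Cluster' is a placeholder cluster name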
Example #7
  def test_list_tables(self):
    if not is_live_cluster():
      raise SkipTest('HUE-2910: Skipping because test is not reentrant')

    for cluster in HbaseApi(self.user).getClusters():
      resp = self.client.post('/hbase/api/getTableList/' + cluster['name'])
      content = json.loads(resp.content)
      assert_true('data' in content, content)
Example #8
  def handle(self, *args, **options):
    if args:
      user = args[0]
    else:
      user = install_sample_user()

    api = HbaseApi(user=user)
    cluster_name = api.getClusters()[0]['name'] # Currently pick first configured cluster

    # Check connectivity
    api.connectCluster(cluster_name)

    self.create_analytics_table(api, cluster_name)
    self.load_analytics_table(api, cluster_name)

    self.create_binary_table(api, cluster_name)
    self.load_binary_table(api, cluster_name)
Example #9
def config_validator(user):
    res = []

    from hbase.api import HbaseApi
    from hbase.settings import NICE_NAME

    try:
        if not 'test' in sys.argv:  # Avoid tests hanging
            api = HbaseApi(user=user)
            cluster_name = api.getClusters()[0][
                'name']  # Currently pick first configured cluster
            # Check connectivity
            api.connectCluster(cluster_name)
            api.getTableList(cluster_name)
    except Exception as e:
        print(e)
        if 'Could not connect' in str(e):
            msg = "The application won't work without a running HBase Thrift Server v1."
        else:
            msg = 'Failed to authenticate to HBase Thrift Server, check authentication configurations.'
        LOG.exception(msg)
        res.append((NICE_NAME, _(msg)))

    if get_thrift_transport() == "framed":
        msg = "Hbase config thrift_transport=framed is not supported"
        LOG.exception(msg)
        res.append((NICE_NAME, _(msg)))

    res.extend(validate_thrift_transport(THRIFT_TRANSPORT))

    return res
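
config_validator() returns a list of (NICE_NAME, message) tuples that Hue surfaces on its configuration check page. A minimal sketch of calling it by hand (assuming a user object is available):

for nice_name, message in config_validator(user):
    print('%s: %s' % (nice_name, message))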
Example #10
    def handle(self, *args, **options):
        if args:
            user = args[0]
        else:
            user = install_sample_user()

        api = HbaseApi(user=user)
        cluster_name = api.getClusters()[0][
            'name']  # Currently pick first configured cluster

        # Check connectivity
        api.connectCluster(cluster_name)

        self.create_analytics_table(api, cluster_name)
        self.load_analytics_table(api, cluster_name)

        self.create_binary_table(api, cluster_name)
        self.load_binary_table(api, cluster_name)
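
handle() here is a Django management command entry point; args[0], when given, selects the user. A hedged sketch of invoking such a command programmatically (the command name 'hbase_setup' is an assumption, not taken from the snippet):

from django.core.management import call_command

call_command('hbase_setup', 'admin')  # the positional arg reaches handle(*args) as the username; both names are placeholders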
Example #11
    def __init__(self,
                 request=None,
                 pk=None,
                 impala_data=None,
                 *args,
                 **kwargs):
        if request is None and pk is None and impala_data is None:
            return super(VariantSerializer, self).__init__(*args, **kwargs)

        # We take the information from the database if we don't have it. As we are interested in one variant, we use HBase
        if impala_data is None:
            # Documentation: https://github.com/cloudera/hue/blob/master/apps/hbase/src/hbase/api.py
            hbaseApi = HbaseApi(user=request.user)
            currentCluster = hbaseApi.getClusters().pop()

            # We arbitrarily take 100 rows to be able to catch every different alternate. Maybe we should take more, not sure about that (we cannot
            # set an end key with the hbase api). Most of the time 100 rows will be way more than enough
            variant = hbaseApi.getRows(cluster=currentCluster['name'],
                                       tableName='variants',
                                       columns=['R', 'I', 'F'],
                                       startRowKey=pk,
                                       numRows=100,
                                       prefix=False)
        else:
            variant = "raw data we got from impala..."

        if variant is not None:
            # We load it in the current object
            if impala_data is None:
                json_data = hbaseToJson(variant)
            else:
                json_data = parquetToJson(impala_data)
            d = jsonToSerializerData(json_data, self.fields, 'variants')

            d['calls'] = []
            for variants_call in json_data['variants.calls[]']:
                call = VariantCallSerializer(variantcall_data=variants_call)
                d['calls'].append(call.data)

            # Load a specific variant
            kwargs['data'] = d
            super(VariantSerializer, self).__init__(*args, **kwargs)

            self.is_valid()
Example #12
    def put(self, request, original_variant, pk):
        # Allows modifying a variant in HBase/Impala

        f = open('/tmp/superhello.txt', 'w')
        f.write(json.dumps(request.data))
        f.close()

        # We convert the original and modified data to flatjson
        fc = formatConverters(input_file='stuff.json',
                              output_file='stuff.json')
        original_flatjson = fc.convertVariantJsonToFlatJson(
            json_data=original_variant)
        modified_flatjson = fc.convertVariantJsonToFlatJson(
            json_data=request.data)

        # We convert the data to hbase, and we directly modify some fields (note: the keys are almost the same for hbase and impala)
        hbase_data = fc.convertVariantFlatJsonToHbase(
            original_data=original_flatjson, modified_data=modified_flatjson)

        # Impala - We make the query
        query_server = get_query_server_config(name='impala')
        db = dbms.get(request.user, query_server=query_server)
        query = hql_query("INSERT INTO variant(" + ",".join(query_data) + ")")
        handle = db.execute_and_wait(query, timeout_sec=5.0)
        if handle:
            db.close(handle)
        else:
            raise Exception("Impossible to create the variant...")

        # HBase - We add the data in that table too
        hbaseApi = HbaseApi(user=request.user)
        currentCluster = hbaseApi.getClusters().pop()
        rowkey = pk
        hbaseApi.putRow(cluster=currentCluster['name'],
                        tableName='variants',
                        row=rowkey,
                        data=hbase_data)
Example #13
def test_security_kerberos():
  tmpdir = tempfile.mkdtemp()
  finish = HBASE_CONF_DIR.set_for_testing(tmpdir)

  try:
    xml = hbase_site_xml(authentication='kerberos')
    open(os.path.join(tmpdir, 'hbase-site.xml'), 'w').write(xml)
    reset()

    assert_equal('KERBEROS', get_server_authentication())
    assert_equal('test', get_server_principal())

    security = HbaseApi._get_security()

    assert_equal('test', security['kerberos_principal_short_name'])
    assert_equal(True, security['use_sasl'])
  finally:
    reset()
    finish()
    shutil.rmtree(tmpdir)
Example #14
def test_security_kerberos():
    tmpdir = tempfile.mkdtemp()
    finish = HBASE_CONF_DIR.set_for_testing(tmpdir)

    try:
        xml = hbase_site_xml(authentication='kerberos')
        open(os.path.join(tmpdir, 'hbase-site.xml'), 'w').write(xml)
        reset()

        assert_equal('KERBEROS', get_server_authentication())
        assert_equal('test', get_server_principal())

        security = HbaseApi._get_security()

        assert_equal('test', security['kerberos_principal_short_name'])
        assert_equal(True, security['use_sasl'])
    finally:
        reset()
        finish()
        shutil.rmtree(tmpdir)
Example #15
File: conf.py Project: 10sr/hue
def config_validator(user):
  res = []

  from hbase.api import HbaseApi
  from hbase.settings import NICE_NAME

  try:
    if not 'test' in sys.argv: # Avoid tests hanging
      api = HbaseApi(user=user)
      cluster_name = api.getClusters()[0]['name'] # Currently pick first configured cluster
      # Check connectivity
      api.connectCluster(cluster_name)
      api.getTableList(cluster_name)
  except Exception as e:
    print(e)
    if 'Could not connect' in str(e):
      msg = "The application won't work without a running HBase Thrift Server v1."
    else:
      msg = 'Failed to authenticate to HBase Thrift Server, check authentication configurations.'
    LOG.exception(msg)
    res.append((NICE_NAME, _(msg)))
Example #16
def import_of_vcf(request, filename, length):
    # It is in charge of importing a vcf (converting the vcf to avro, etc.), and as it is not fast, we should call
    # this method asynchronously

    # Connection to the db
    try:
        query_server = get_query_server_config(name='impala')
        db = dbms.get(request.user, query_server=query_server)
    except Exception:
        return False

    hbaseApi = HbaseApi(user=request.user)
    currentCluster = hbaseApi.getClusters().pop()

    # To analyze the content of the vcf, we need to get it from the hdfs to this node
    buffer = min(length, 1024 * 1024 * 512)
    tmp_filename = 'import_' + request.user.username + '_' + str(
        int(time.time()))
    f = open('/tmp/cgs_' + tmp_filename + '.vcf', mode='w')
    for offset in xrange(0, length, buffer):
        tmp_vcf = request.fs.read(path='/user/' + request.user.username + '/' +
                                  filename,
                                  offset=offset,
                                  length=buffer,
                                  bufsize=buffer)
        f.write(tmp_vcf)
    f.close()

    # Now we try to analyze the vcf a little bit more with the correct tool
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_' + tmp_filename + '.vcf',
                               output_file='/tmp/cgs_' + tmp_filename +
                               '.json',
                               input_type='vcf',
                               output_type='jsonflat')
    status, columns, ids_of_samples, rowkeys = convert.convertVcfToFlatJson(
        request=request, initial_file=filename)
    f = open('/tmp/cgs_superhello.txt', 'w')
    f.write('EXECUTION TIME to flat json:' + str(time.time() - st) + '\n')
    f.close()

    # We put the output on hdfs
    json_size = os.path.getsize('/tmp/cgs_' + tmp_filename + '.json')
    buffer = min(json_size, 1024 * 1024 * 50)
    st = time.time()
    with open('/tmp/cgs_' + tmp_filename + '.json', 'r') as content_file:
        request.fs.create('/user/cgs/' + tmp_filename + '.json',
                          overwrite=True,
                          data='')
        for offset in xrange(0, json_size, buffer):
            ftmp = open('/tmp/cgs_superhello.txt', 'a')
            ftmp.write('Pushing flatjson to hdfs (/user/cgs/' + tmp_filename +
                       ')... ' + str(time.time() - st) + '\n')
            ftmp.close()
            request.fs.append('/user/cgs/' + tmp_filename + '.json',
                              data=content_file.read(buffer))

    # We modify the avsc file with the new calls if needed (in fact, we get the basic schema
    # and we will add data from the existing db)
    avro_schema = {}
    with open('myapps/variants/variants.avsc', 'r') as content_file:
        avro_schema = json.loads(content_file.read())
        with open('/tmp/cgs_' + tmp_filename + '.avsc', 'w') as f:
            f.write(json.dumps(avro_schema))

    existing_columns = []
    for field in avro_schema['fields']:
        existing_columns.append(field['name'])
    modified_avro_schema = False
    specific_columns = []  # Used below for the import in impala/hive
    for sample_id in ids_of_samples:
        destination_field = 'I_CALL_' + sample_id

        if destination_field not in specific_columns:
            specific_columns.append(destination_field)

        if destination_field not in existing_columns:
            # The current sample does not exist yet in the avsc file, we need to add it
            call_schema = {
                "name": destination_field,
                "type": ["string", "null"],
                "doc": "Column for a specific sample"
            }
            avro_schema['fields'].append(call_schema)
            existing_columns.append(destination_field)
            modified_avro_schema = True

    if modified_avro_schema is True:
        with open('/tmp/cgs_' + tmp_filename + '.avsc', 'w') as content_file:
            content_file.write(json.dumps(avro_schema))

        request.fs.create('/user/cgs/cgs_' + tmp_filename + '.avsc',
                          overwrite=True,
                          data=json.dumps(avro_schema))

    # We convert the flat json to hbase (mostly a key mapping)
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_' + tmp_filename + '.json',
                               output_file='/tmp/cgs_' + tmp_filename +
                               '.hbase',
                               input_type='jsonflat',
                               output_type='hbase')
    status = convert.convertFlatJsonToHbase()
    ftmp = open('/tmp/cgs_superhello.txt', 'a')
    ftmp.write('Conversion from flatjson to hbase... ' +
               str(time.time() - st) + '\n')
    ftmp.close()

    # We put the hbase file on hdfs
    hbase_length = os.path.getsize('/tmp/cgs_' + tmp_filename + '.hbase')
    buffer = min(hbase_length, 1024 * 1024 * 50)
    st = time.time()
    with open('/tmp/cgs_' + tmp_filename + '.hbase', 'r') as content_file:
        request.fs.create('/user/cgs/' + tmp_filename + '.hbase',
                          overwrite=True,
                          data='')
        for offset in xrange(0, hbase_length, buffer):
            cont = content_file.read(buffer)
            ftmp = open('/tmp/cgs_superhello.txt', 'a')
            ftmp.write('Pushing hbase to hdfs (/user/cgs/' + tmp_filename +
                       '.hbase)... ' + str(time.time() - st) + '\n')
            ftmp.close()
            request.fs.append('/user/cgs/' + tmp_filename + '.hbase',
                              data=cont)

    # We convert the hbase file to an avro file
    st = time.time()
    convert = formatConverters(
        input_file='/tmp/cgs_' + tmp_filename + '.hbase',
        output_file='/tmp/cgs_' + tmp_filename + '.avro',
        input_type='jsonflat',
        output_type='avro')
    status = convert.convertHbaseToAvro(avscFile='/tmp/cgs_' + tmp_filename +
                                        '.avsc')

    ftmp = open('/tmp/cgs_superhello.txt', 'a')
    ftmp.write('Conversion from hbase to avro... ' + str(time.time() - st) +
               '\n')
    ftmp.close()

    # We put the avro file on hdfs
    st = time.time()
    avro_length = os.path.getsize('/tmp/cgs_' + tmp_filename + '.avro')
    buffer = min(avro_length, 1024 * 1024 * 50)
    with open('/tmp/cgs_' + tmp_filename + '.avro', 'r') as content_file:
        request.fs.create('/user/cgs/' + tmp_filename + '.avro',
                          overwrite=True,
                          data='')
        request.fs.create('/user/cgs/' + tmp_filename + '.archive.avro',
                          overwrite=True,
                          data='')
        for offset in xrange(0, avro_length, buffer):
            cont = content_file.read(buffer)
            ftmp = open('/tmp/cgs_superhello.txt', 'a')
            ftmp.write('Pushing avro to hdfs (/user/cgs/' + tmp_filename +
                       '.avro)... ' + str(time.time() - st) + '\n')
            ftmp.close()
            request.fs.append('/user/cgs/' + tmp_filename + '.avro', data=cont)
            request.fs.append('/user/cgs/' + tmp_filename + '.archive.avro',
                              data=cont)

    tmpf = open('/tmp/cgs_superhello.txt', 'a')
    # 0: We get the columns from the parquet table to detect missing columns for the new calls we just created
    query = hql_query("show column stats variants")
    handle = db.execute_and_wait(query, timeout_sec=30.0)
    data = db.fetch(handle, rows=1000000)
    rows = list(data.rows())
    columns_for_new_calls = []
    existing_calls_columns = []
    for row in rows:
        current_column = row[0]
        if current_column.startswith('i_call_'):
            existing_calls_columns.append(str(current_column).lower())

    for current_sample in ids_of_samples:
        destination_field = str('I_CALL_' + current_sample).lower()
        if destination_field not in existing_calls_columns and destination_field not in columns_for_new_calls:
            columns_for_new_calls.append(destination_field)
    tmpf.write("Existing calls: " + json.dumps(existing_calls_columns) +
               "\r\n")
    tmpf.write("New calls: " + json.dumps(columns_for_new_calls))
    tmpf.close()

    # 1st: we create a temporary hive table with avro storage
    st = time.time()
    result, variants_table = database_create_variants(
        request, temporary=True, specific_columns=specific_columns)

    tmpf = open('/tmp/cgs_superhello.txt', 'a')
    # 2nd: we import the previously created avro file inside the temporary avro table
    query_server = get_query_server_config(name='hive')
    hive_db = dbms.get(request.user, query_server=query_server)
    variants_columns = []
    for variants_column in variants_table:
        variants_columns.append(str(variants_column).split(' ').pop(0))

    query = hql_query("load data inpath '/user/cgs/" + tmp_filename +
                      ".avro' into table variants_tmp_" +
                      request.user.username + ";")
    handle = hive_db.execute_and_wait(query, timeout_sec=3600.0)

    # Necessary for Impala to detect a Hive table
    query = hql_query("invalidate metadata;")
    handle = db.execute_and_wait(query, timeout_sec=30.0)

    # 3rd: we modify the global parquet table, if needed, to add any new columns for each call
    if len(columns_for_new_calls) > 0:
        query = hql_query("alter table variants add columns (" +
                          ' STRING, '.join(columns_for_new_calls) + " STRING)")
        handle = db.execute_and_wait(query, timeout_sec=3600.0)

    # 4th: we import the data from the temporary avro table to the global parquet table
    query = hql_query("insert into table variants (" +
                      ','.join(variants_columns) + ") select " +
                      ','.join(variants_columns) + " from variants_tmp_" +
                      request.user.username + " ;")
    handle = db.execute_and_wait(query, timeout_sec=3600.0)

    # 5th: we delete the temporary table
    #query = hql_query("drop table variants_tmp_"+request.user.username+";")
    #handle = hive_db.execute_and_wait(query, timeout_sec=30.0)
    ftmp = open('/tmp/cgs_superhello.txt', 'a')
    ftmp.write(
        'Creation of temporary table, import to global variants table (parquet): '
        + str(time.time() - st) + '\n')
    ftmp.close()

    st = time.time()
    # We put the data in HBase. For now we do it simply, but we should use the bulk upload (TODO)
    with open('/tmp/cgs_' + tmp_filename + '.hbase', 'r') as content_file:
        for line in content_file:
            # We create the json content
            hbase_data = json.loads(line)
            rowkey = hbase_data['rowkey']
            del hbase_data['rowkey']
            del hbase_data['pk']

            # We can save the new variant
            hbaseApi.putRow(cluster=currentCluster['name'],
                            tableName='variants',
                            row=rowkey,
                            data=hbase_data)

    ftmp = open('/tmp/cgs_superhello.txt', 'a')
    ftmp.write('Import into HBase: ' + str(time.time() - st) + '\n')
    ftmp.close()

    # We delete the temporary files previously created on this node
    os.remove('/tmp/cgs_' + tmp_filename + '.avsc')
    os.remove('/tmp/cgs_' + tmp_filename + '.vcf')
    os.remove('/tmp/cgs_' + tmp_filename + '.json')
    os.remove('/tmp/cgs_' + tmp_filename + '.avro')
    os.remove('/tmp/cgs_' + tmp_filename + '.hbase')

    return True
Example #17
##
## Insert various data into HBase
##

## cd $HUE_HOME (e.g. cd /usr/share/hue, or /opt/cloudera/parcels/CDH-XXXXX/share/hue if using parcels)
## build/env/bin/hue shell
##

from hbase.api import HbaseApi

HbaseApi().putRow('Cluster', 'events', 'hue-20130801',
                  {'doc:txt': 'Hue is awesome!'})
HbaseApi().putRow('Cluster', 'events', 'hue-20130801',
                  {'doc:json': '{"user": "******", "coolness": "extra"}'})
HbaseApi().putRow('Cluster', 'events', 'hue-20130802',
                  {'doc:version': '<xml>I like HBase</xml>'})
HbaseApi().putRow('Cluster', 'events', 'hue-20130802',
                  {'doc:version': '<xml>I LOVE HBase</xml>'})

## From https://github.com/romainr/hadoop-tutorials-examples
## cd /tmp
## git clone https://github.com/romainr/hadoop-tutorials-examples.git

root = '/tmp/hadoop-tutorials-examples'

HbaseApi().putRow(
    'Cluster', 'events', 'hue-20130801',
    {'doc:img': open(root + '/hbase-tables/data/hue-logo.png', "rb").read()})
HbaseApi().putRow('Cluster', 'events', 'hue-20130801', {
    'doc:html':
    open(root + '/hbase-tables/data/gethue.com.html', "rb").read()
})
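
To read the inserted rows back, the same API exposes getRows (with the signature used in Examples #3 and #11). A minimal sketch, assuming the shell session above:

# Fetch the rows written above from the 'events' table, 'doc' column family
rows = HbaseApi().getRows(cluster='Cluster', tableName='events',
                          columns=['doc'], startRowKey='hue-20130801',
                          numRows=10, prefix=False)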
Example #18
def import_of_vcf(request, filename, length):
    # It is in charge of importing a vcf (converting the vcf to avro, etc.), and as it is not fast, we should call
    # this method asynchronously

    # Connection to the db
    try:
        query_server = get_query_server_config(name='impala')
        db = dbms.get(request.user, query_server=query_server)
    except Exception:
        return False

    hbaseApi = HbaseApi(user=request.user)
    currentCluster = hbaseApi.getClusters().pop()

    # To analyze the content of the vcf, we need to get it from the hdfs to this node
    buffer = min(length,1024*1024*512)
    tmp_filename = 'import_'+request.user.username+'_'+str(int(time.time()))
    f = open('/tmp/cgs_'+tmp_filename+'.vcf',mode='w')
    for offset in xrange(0, length, buffer):
        tmp_vcf = request.fs.read(path='/user/'+request.user.username+'/'+filename, offset=offset, length=buffer, bufsize=buffer)
        f.write(tmp_vcf)
    f.close()

    # Now we try to analyze the vcf a little bit more with the correct tool
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_'+tmp_filename+'.vcf',output_file='/tmp/cgs_'+tmp_filename+'.json',input_type='vcf',output_type='jsonflat')
    status, columns, ids_of_samples, rowkeys = convert.convertVcfToFlatJson(request=request, initial_file=filename)
    f = open('/tmp/cgs_superhello.txt','w')
    f.write('EXECUTION TIME to flat json:'+str(time.time()-st)+'\n')
    f.close()

    # We put the output on hdfs
    json_size = os.path.getsize('/tmp/cgs_'+tmp_filename+'.json')
    buffer = min(json_size, 1024*1024*50)
    st = time.time()
    with open('/tmp/cgs_'+tmp_filename+'.json', 'r') as content_file:
        request.fs.create('/user/cgs/'+tmp_filename+'.json', overwrite=True, data='')
        for offset in xrange(0, json_size, buffer):
            ftmp = open('/tmp/cgs_superhello.txt','a')
            ftmp.write('Pushing flatjson to hdfs (/user/cgs/'+tmp_filename+')... '+str(time.time()-st)+'\n')
            ftmp.close()
            request.fs.append('/user/cgs/'+tmp_filename+'.json', data=content_file.read(buffer))

    # We modify the avsc file with the new calls if needed (in fact, we get the basic schema
    # and we will add data from the existing db)
    avro_schema = {}
    with open('myapps/variants/variants.avsc','r') as content_file:
        avro_schema = json.loads(content_file.read())
        with open('/tmp/cgs_'+tmp_filename+'.avsc','w') as f:
            f.write(json.dumps(avro_schema))

    existing_columns = []
    for field in avro_schema['fields']:
        existing_columns.append(field['name'])
    modified_avro_schema = False
    specific_columns = [] # Used below for the import in impala/hive
    for sample_id in ids_of_samples:
        destination_field = 'I_CALL_'+sample_id

        if destination_field not in specific_columns:
            specific_columns.append(destination_field)

        if destination_field not in existing_columns:
            # The current sample does not exist yet in the avsc file, we need to add it
            call_schema = {"name":destination_field,"type":["string","null"],"doc":"Column for a specific sample"}
            avro_schema['fields'].append(call_schema)
            existing_columns.append(destination_field)
            modified_avro_schema = True

    if modified_avro_schema is True:
        with open('/tmp/cgs_'+tmp_filename+'.avsc','w') as content_file:
            content_file.write(json.dumps(avro_schema))

        request.fs.create('/user/cgs/cgs_'+tmp_filename+'.avsc', overwrite=True, data=json.dumps(avro_schema))

    # We convert the flat json to hbase (mostly a key mapping)
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_'+tmp_filename+'.json',output_file='/tmp/cgs_'+tmp_filename+'.hbase',input_type='jsonflat',output_type='hbase')
    status = convert.convertFlatJsonToHbase()
    ftmp = open('/tmp/cgs_superhello.txt','a')
    ftmp.write('Conversion from flatjson to hbase... '+str(time.time()-st)+'\n')
    ftmp.close()

    # We put the hbase file on hdfs
    hbase_length = os.path.getsize('/tmp/cgs_'+tmp_filename+'.hbase')
    buffer = min(hbase_length,1024*1024*50)
    st = time.time()
    with open('/tmp/cgs_'+tmp_filename+'.hbase', 'r') as content_file:
        request.fs.create('/user/cgs/'+tmp_filename+'.hbase', overwrite=True, data='')
        for offset in xrange(0, hbase_length, buffer):
            cont = content_file.read(buffer)
            ftmp = open('/tmp/cgs_superhello.txt','a')
            ftmp.write('Pushing hbase to hdfs (/user/cgs/'+tmp_filename+'.hbase)... '+str(time.time()-st)+'\n')
            ftmp.close()
            request.fs.append('/user/cgs/'+tmp_filename+'.hbase', data=cont)

    # We convert the hbase file to an avro file
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_'+tmp_filename+'.hbase',output_file='/tmp/cgs_'+tmp_filename+'.avro',input_type='jsonflat',output_type='avro')
    status = convert.convertHbaseToAvro(avscFile='/tmp/cgs_'+tmp_filename+'.avsc')

    ftmp = open('/tmp/cgs_superhello.txt','a')
    ftmp.write('Conversion from hbase to avro... '+str(time.time()-st)+'\n')
    ftmp.close()

    # We put the avro file on hdfs
    st = time.time()
    avro_length = os.path.getsize('/tmp/cgs_'+tmp_filename+'.avro')
    buffer = min(avro_length, 1024*1024*50)
    with open('/tmp/cgs_'+tmp_filename+'.avro', 'r') as content_file:
        request.fs.create('/user/cgs/'+tmp_filename+'.avro', overwrite=True, data='')
        request.fs.create('/user/cgs/'+tmp_filename+'.archive.avro', overwrite=True, data='')
        for offset in xrange(0, avro_length, buffer):
            cont = content_file.read(buffer)
            ftmp = open('/tmp/cgs_superhello.txt','a')
            ftmp.write('Pushing avro to hdfs (/user/cgs/'+tmp_filename+'.avro)... '+str(time.time()-st)+'\n')
            ftmp.close()
            request.fs.append('/user/cgs/'+tmp_filename+'.avro', data=cont)
            request.fs.append('/user/cgs/'+tmp_filename+'.archive.avro', data=cont)

    tmpf = open('/tmp/cgs_superhello.txt','a')
    # 0: We get the columns from the parquet table to detect missing columns for the new calls we just created
    query = hql_query("show column stats variants")
    handle = db.execute_and_wait(query, timeout_sec=30.0)
    data = db.fetch(handle, rows=1000000)
    rows = list(data.rows())
    columns_for_new_calls = []
    existing_calls_columns = []
    for row in rows:
        current_column = row[0]
        if current_column.startswith('i_call_'):
            existing_calls_columns.append(str(current_column).lower())

    for current_sample in ids_of_samples:
        destination_field = str('I_CALL_'+current_sample).lower()
        if destination_field not in existing_calls_columns and destination_field not in columns_for_new_calls:
            columns_for_new_calls.append(destination_field)
    tmpf.write("Existing calls: "+json.dumps(existing_calls_columns)+"\r\n")
    tmpf.write("New calls: "+json.dumps(columns_for_new_calls))
    tmpf.close()

    # 1st: we create a temporary hive table with avro storage
    st = time.time()
    result, variants_table = database_create_variants(request, temporary=True, specific_columns=specific_columns)

    tmpf = open('/tmp/cgs_superhello.txt','a')
    # 2nd: we import the previously created avro file inside the temporary avro table
    query_server = get_query_server_config(name='hive')
    hive_db = dbms.get(request.user, query_server=query_server)
    variants_columns = []
    for variants_column in variants_table:
        variants_columns.append(str(variants_column).split(' ').pop(0))

    query = hql_query("load data inpath '/user/cgs/"+tmp_filename+".avro' into table variants_tmp_"+request.user.username+";")
    handle = hive_db.execute_and_wait(query, timeout_sec=3600.0)

    # Necessary for Impala to detect a Hive table
    query = hql_query("invalidate metadata;")
    handle = db.execute_and_wait(query, timeout_sec=30.0)

    # 3rd: we modify the global parquet table, if needed, to add any new columns for each call
    if len(columns_for_new_calls) > 0:
        query = hql_query("alter table variants add columns ("+' STRING, '.join(columns_for_new_calls)+" STRING)")
        handle = db.execute_and_wait(query, timeout_sec=3600.0)

    # 4th: we import the data from the temporary avro table to the global parquet table
    query = hql_query("insert into table variants ("+','.join(variants_columns)+") select "+','.join(variants_columns)+" from variants_tmp_"+request.user.username+" ;")
    handle = db.execute_and_wait(query, timeout_sec=3600.0)

    # 5th: we delete the temporary table
    #query = hql_query("drop table variants_tmp_"+request.user.username+";")
    #handle = hive_db.execute_and_wait(query, timeout_sec=30.0)
    ftmp = open('/tmp/cgs_superhello.txt','a')
    ftmp.write('Creation of temporary table, import to global variants table (parquet): '+str(time.time()-st)+'\n')
    ftmp.close()

    st = time.time()
    # We put the data in HBase. For now we do it simply, but we should use the bulk upload (TODO)
    with open('/tmp/cgs_'+tmp_filename+'.hbase', 'r') as content_file:
        for line in content_file:
            # We create the json content
            hbase_data = json.loads(line)
            rowkey = hbase_data['rowkey']
            del hbase_data['rowkey']
            del hbase_data['pk']

            # We can save the new variant
            hbaseApi.putRow(cluster=currentCluster['name'], tableName='variants', row=rowkey, data=hbase_data)

    ftmp = open('/tmp/cgs_superhello.txt','a')
    ftmp.write('Import into HBase: '+str(time.time()-st)+'\n')
    ftmp.close()

    # We delete the temporary files previously created on this node
    os.remove('/tmp/cgs_'+tmp_filename+'.avsc')
    os.remove('/tmp/cgs_'+tmp_filename+'.vcf')
    os.remove('/tmp/cgs_'+tmp_filename+'.json')
    os.remove('/tmp/cgs_'+tmp_filename+'.avro')
    os.remove('/tmp/cgs_'+tmp_filename+'.hbase')

    return True