Example #1
    def test_guess_csv_format(self):
        stream = StringIO.StringIO(TestIndexer.simpleCSVString)
        indexer = MorphlineIndexer("test", solr_client=self.solr_client)

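        # sniff the CSV format (separators, quoting, header) from the sample stream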
        guessed_format = indexer.guess_format(
            {'file': {
                "stream": stream,
                "name": "test.csv"
            }})

        fields = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": "test.csv"
            },
            "format": guessed_format
        })['columns']
        # test format
        expected_format = self.simpleCSVFormat

        assert_equal(expected_format, guessed_format)

        # test fields
        expected_fields = self.simpleCSVFields

        for expected, actual in zip(expected_fields, fields):
            for key in ("name", "type"):
                assert_equal(expected[key], actual[key])
Example #2
def importer_submit(request):
    source = json.loads(request.POST.get('source', '{}'))
    outputFormat = json.loads(request.POST.get('destination',
                                               '{}'))['outputFormat']
    destination = json.loads(request.POST.get('destination', '{}'))
    destination['ouputFormat'] = outputFormat  # Workaround a very weird bug
    start_time = json.loads(request.POST.get('start_time', '-1'))

    if destination['ouputFormat'] == 'index':
        source['columns'] = destination['columns']
        index_name = destination["name"]

        if destination['indexerRunJob']:
            _convert_format(source["format"], inverse=True)
            job_handle = _index(request,
                                source,
                                index_name,
                                start_time=start_time,
                                lib_path=destination['indexerJobLibPath'])
        else:
            client = SolrClient(request.user)
            unique_key_field = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None
            df = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
            kwargs = {}

            stats = request.fs.stats(source["path"])
            if stats.size > MAX_UPLOAD_SIZE:
                raise PopupException(_('File size is too large to handle!'))

            indexer = MorphlineIndexer(request.user, request.fs)
            fields = indexer.get_kept_field_list(source['columns'])
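            # fall back to a generated 'hue_id' field when no unique key field was selected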
            if not unique_key_field:
                unique_key_field = 'hue_id'
                fields += [{"name": unique_key_field, "type": "string"}]
                kwargs['rowid'] = unique_key_field

            if not client.exists(index_name):
                client.create_index(name=index_name,
                                    fields=fields,
                                    unique_key_field=unique_key_field,
                                    df=df)

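            # read at most MAX_UPLOAD_SIZE bytes from the source file and index them directly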
            data = request.fs.read(source["path"], 0, MAX_UPLOAD_SIZE)
            client.index(name=index_name, data=data, **kwargs)

            job_handle = {
                'status': 0,
                'on_success_url': reverse('search:browse', kwargs={'name': index_name})
            }
    elif destination['ouputFormat'] == 'database':
        job_handle = _create_database(request, source, destination, start_time)
    else:
        job_handle = _create_table(request, source, destination, start_time)

    return JsonResponse(job_handle)
Example #3
def guess_format(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    if not request.fs.isfile(file_format["path"]):
      raise PopupException(_('Path %(path)s is not a file') % file_format)

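    # open the source file and let the indexer sniff its format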
    stream = request.fs.open(file_format["path"])
    format_ = indexer.guess_format({
      "file": {
        "stream": stream,
        "name": file_format['path']
      }
    })
    _convert_format(format_)
  elif file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    storage = dict([(delim['data_type'], delim['comment']) for delim in table_metadata.storage_details])
    if table_metadata.details['properties']['format'] == 'text':
      format_ = {"quoteChar": "\"", "recordSeparator": '\\n', "type": "csv", "hasHeader": False, "fieldSeparator": storage['serialization.format']}
    elif table_metadata.details['properties']['format'] == 'parquet':
      format_ = {"type": "parquet", "hasHeader": False,}
    else:
      raise PopupException('Hive table format %s is not supported.' % table_metadata.details['properties']['format'])
  elif file_format['inputFormat'] == 'query':
    format_ = {"quoteChar": "\"", "recordSeparator": "\\n", "type": "csv", "hasHeader": False, "fieldSeparator": "\u0001"}
  elif file_format['inputFormat'] == 'rdbms':
    format_ = RdbmsIndexer(request.user, file_format['rdbmsType']).guess_format()

  format_['status'] = 0
  return JsonResponse(format_)
Example #4
  def test_generate_csv_morphline(self):
    indexer = MorphlineIndexer("test", solr_client=self.solr_client)
    morphline = indexer.generate_morphline_config("test_collection", {
        "columns": deepcopy(self.simpleCSVFields),
        "format": self.simpleCSVFormat
      })

    assert_true(isinstance(morphline, basestring))
Example #5
def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    stream = request.fs.open(file_format["path"])
    _convert_format(file_format["format"], inverse=True)

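    # guess column names and types from the file stream using the submitted format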
    format_ = indexer.guess_field_types({
      "file": {
          "stream": stream,
          "name": file_format['path']
        },
      "format": file_format['format']
    })
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
        "sample": sample['rows'][:4],
        "columns": [
            Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
            for col in table_metadata.cols
        ]
    }
  elif file_format['inputFormat'] == 'query': # Only support open query history
    # TODO get schema from explain query, which is not possible
    notebook = Notebook(document=Document2.objects.get(id=file_format['query'])).get_data()
    snippet = notebook['snippets'][0]
    sample = get_api(request, snippet).fetch_result(notebook, snippet, 4, start_over=True)

    format_ = {
        "sample": sample['rows'][:4],
        "sample_cols": sample.meta,
        "columns": [
            Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
            for col in sample.meta
        ]
    }
  elif file_format['inputFormat'] == 'rdbms':
    query_server = rdbms.get_query_server_config(server=file_format['rdbmsType'])
    db = rdbms.get(request.user, query_server=query_server)
    sample = RdbmsIndexer(request.user, file_format['rdbmsType']).get_sample_data(mode=file_format['rdbmsMode'], database=file_format['rdbmsDatabaseName'], table=file_format['rdbmsTableName'])
    table_metadata = db.get_columns(file_format['rdbmsDatabaseName'], file_format['rdbmsTableName'], names_only=False)

    format_ = {
        "sample": list(sample['rows'])[:4],
        "columns": [
            Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
            for col in table_metadata
        ]
    }

  return JsonResponse(format_)
Example #6
  def _test_fixed_type_format_generate_morphline(self, format_):
    indexer = MorphlineIndexer("test", solr_client=self.solr_client)
    format_instance = format_()

    morphline = indexer.generate_morphline_config("test_collection", {
        "columns": [field.to_dict() for field in format_instance.fields],
        "format": format_instance.get_format()
      })

    assert_true(isinstance(morphline, basestring))
Example #7
def _create_index(user, fs, client, source, destination, index_name):
    unique_key_field = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
    df = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None
    kwargs = {}

    if source['inputFormat'] not in ('manual', 'table'):
        stats = fs.stats(source["path"])
        if stats.size > MAX_UPLOAD_SIZE:
            raise PopupException(_('File size is too large to handle!'))

    indexer = MorphlineIndexer(user, fs)
    fields = indexer.get_field_list(destination['columns'])
    skip_fields = [field['name'] for field in fields if not field['keep']]

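    # tell Solr about every incoming column, then skip the ones not marked as kept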
    kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
    if skip_fields:
        kwargs['skip'] = ','.join(skip_fields)
        fields = [
            field for field in fields if field['name'] not in skip_fields
        ]

    if not unique_key_field:
        unique_key_field = 'hue_id'
        fields += [{"name": unique_key_field, "type": "string"}]
        kwargs['rowid'] = unique_key_field

    if not destination['hasHeader']:
        kwargs['header'] = 'false'
    else:
        kwargs['skipLines'] = 1

    if not client.exists(index_name):
        client.create_index(
            name=index_name,
            config_name=destination.get('indexerConfigSet'),
            fields=fields,
            unique_key_field=unique_key_field,
            df=df,
            shards=destination['indexerNumShards'],
            replication=destination['indexerReplicationFactor'])

    if source['inputFormat'] not in ('manual', 'table'):
        data = fs.read(source["path"], 0, MAX_UPLOAD_SIZE)
        try:
            client.index(name=index_name, data=data, **kwargs)
        except Exception as e:
            try:
                client.delete_index(index_name, keep_config=False)
            except Exception as e2:
                LOG.warn(
                    'Error while cleaning-up config of failed collection creation %s: %s'
                    % (index_name, e2))
            raise e
Example #8
  def _test_generate_field_operation_morphline(self, operation_format):
    fields = deepcopy(TestIndexer.simpleCSVFields)
    fields[0]['operations'].append(operation_format)

    indexer = MorphlineIndexer("test", solr_client=self.solr_client)
    morphline = indexer.generate_morphline_config("test_collection", {
        "columns": fields,
        "format": TestIndexer.simpleCSVFormat
      })

    assert_true(isinstance(morphline, basestring))
Example #9
def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    stream = request.fs.open(file_format["path"])
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
          "stream": stream,
          "name": file_format['path']
        },
      "format": file_format['format']
    })
  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
        "sample": sample['rows'][:4],
        "columns": [
            Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
            for col in table_metadata.cols
        ]
    }
  elif file_format['inputFormat'] == 'query':
    query_id = file_format['query']['id'] if file_format['query'].get('id') else file_format['query']

    notebook = Notebook(document=Document2.objects.document(user=request.user, doc_id=query_id)).get_data()
    snippet = notebook['snippets'][0]
    db = get_api(request, snippet)

    if file_format.get('sampleCols'):
      columns = file_format.get('sampleCols')
      sample = file_format.get('sample')
    else:
      snippet['query'] = snippet['statement']
      try:
        sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4]
      except Exception as e:
        LOG.warn('Skipping sample data as query handle might be expired: %s' % e)
        sample = [[], [], [], [], []]
      columns = db.autocomplete(snippet=snippet, database='', table='')
      columns = [
          Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
          for col in columns['extended_columns']
      ]
    format_ = {
        "sample": sample,
        "columns": columns,
    }
Example #10
def _create_index(user, fs, client, source, destination, index_name):
    unique_key_field = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None
    df = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
    kwargs = {}

    if source['inputFormat'] != 'manual':
        stats = fs.stats(source["path"])
        if stats.size > MAX_UPLOAD_SIZE:
            raise PopupException(_('File size is too large to handle!'))

    indexer = MorphlineIndexer(user, fs)
    fields = indexer.get_field_list(destination['columns'])
    skip_fields = [field['name'] for field in fields if not field['keep']]

    kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
    if skip_fields:
        kwargs['skip'] = ','.join(skip_fields)
        fields = [
            field for field in fields if field['name'] not in skip_fields
        ]

    if not unique_key_field:
        unique_key_field = 'hue_id'
        fields += [{"name": unique_key_field, "type": "string"}]
        kwargs['rowid'] = unique_key_field

    if not destination['hasHeader']:
        kwargs['header'] = 'false'
    else:
        kwargs['skipLines'] = 1

    if not client.exists(index_name):
        client.create_index(
            name=index_name,
            fields=fields,
            unique_key_field=unique_key_field,
            df=df,
            shards=destination['indexerNumShards'],
            replication=destination['indexerReplicationFactor'])

    if source['inputFormat'] != 'manual':
        data = fs.read(source["path"], 0, MAX_UPLOAD_SIZE)
        client.index(name=index_name, data=data, **kwargs)

    return {
        'status': 0,
        'on_success_url': reverse('indexer:indexes',
                                  kwargs={'index': index_name}),
        'pub_sub_url': 'assist.collections.refresh'
    }
Example #11
def _large_indexing(request, file_format, collection_name, query=None, start_time=None, lib_path=None, destination=None):
  indexer = MorphlineIndexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  client = SolrClient(user=request.user)

  if not client.exists(collection_name) and not request.POST.get('show_command'): # if destination['isTargetExisting']:
    client.create_index(
      name=collection_name,
      fields=request.POST.get('fields', schema_fields),
      unique_key_field=unique_field
      # No df currently
    )
  else:
    # TODO: check if format matches
    pass

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'stream' and file_format['streamSelection'] == 'flume':
    indexer = FlumeIndexer(user=request.user)
    if request.POST.get('show_command'):
      configs = indexer.generate_config(file_format, destination)
      return {'status': 0, 'commands': configs[-1]}
    else:
      return indexer.start(collection_name, file_format, destination)
  elif file_format['inputFormat'] == 'stream':
    return _envelope_job(request, file_format, destination, start_time=start_time, lib_path=lib_path)
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % urllib_unquote(file_format["path"])
  else:
    input_path = None

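  # generate the morphline config and submit it as an indexing job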
  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field, lib_path=lib_path)

  return indexer.run_morphline(
      request,
      collection_name,
      morphline,
      input_path,
      query,
      start_time=start_time,
      lib_path=lib_path
  )
Example #12
  def test_guess_format_invalid_csv_format(self):
    indexer = MorphlineIndexer("test", solr_client=self.solr_client)
    stream = StringIO.StringIO(TestIndexer.simpleCSVString)

    guessed_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})

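    # corrupt the guessed format and check that field guessing falls back to an empty column list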
    guessed_format["fieldSeparator"] = "invalid separator"

    fields = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": guessed_format})['columns']
    assert_equal(fields, [])

    stream.seek(0)
    guessed_format = indexer.guess_format({'file':  {"stream": stream, "name": "test.csv"}})

    guessed_format["recordSeparator"] = "invalid separator"

    fields = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": guessed_format})['columns']
    assert_equal(fields, [])

    stream.seek(0)
    guessed_format = indexer.guess_format({'file':  {"stream": stream, "name": "test.csv"}})

    guessed_format["quoteChar"] = "invalid quoteChar"

    fields = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": guessed_format})['columns']
    assert_equal(fields, [])
Example #13
def guess_format(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        path = urllib.unquote(file_format["path"])
        indexer = MorphlineIndexer(request.user, request.fs)
        if not request.fs.isfile(path):
            raise PopupException(
                _('Path %(path)s is not a file') % file_format)

        stream = request.fs.open(path)
        format_ = indexer.guess_format(
            {"file": {
                "stream": stream,
                "name": path
            }})
        _convert_format(format_)
    elif file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        try:
            table_metadata = db.get_table(database=file_format['databaseName'],
                                          table_name=file_format['tableName'])
        except Exception as e:
            raise PopupException(
                e.message if hasattr(e, 'message') and e.message else e)
        storage = {}
        for delim in table_metadata.storage_details:
            if delim['data_type']:
                if '=' in delim['data_type']:
                    key, val = delim['data_type'].split('=', 1)
                    storage[key] = val
                else:
                    storage[delim['data_type']] = delim['comment']
        if table_metadata.details['properties']['format'] == 'text':
            format_ = {
                "quoteChar": "\"",
                "recordSeparator": '\\n',
                "type": "csv",
                "hasHeader": False,
                "fieldSeparator": storage.get('field.delim', ',')
            }
        elif table_metadata.details['properties']['format'] == 'parquet':
            format_ = {
                "type": "parquet",
                "hasHeader": False,
            }
        else:
            raise PopupException(
                'Hive table format %s is not supported.' %
                table_metadata.details['properties']['format'])
Example #14
  def test_end_to_end(self):
    if not is_live_cluster(): # Skipping as it requires the morphline libs to be set up
      raise SkipTest()

    cluster = shared_cluster()
    fs = cluster.fs
    make_logged_in_client(username="******", groupname="default", recreate=True, is_superuser=False)
    user = User.objects.get(username="******")
    collection_name = "test_collection"
    indexer = MorphlineIndexer("test", fs=fs, jt=cluster.jt, solr_client=self.solr_client)
    input_loc = "/tmp/test.csv"

    # upload the test file to hdfs
    fs.create(input_loc, data=TestIndexer.simpleCSVString, overwrite=True)

    # open a filestream for the file on hdfs
    stream = fs.open(input_loc)

    # guess the format of the file
    file_type_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})

    field_types = indexer.guess_field_types({"file":{"stream": stream, "name": "test.csv"}, "format": file_type_format})

    format_ = field_types.copy()
    format_['format'] = file_type_format

    # find a field name available to use for the record's uuid
    unique_field = indexer.get_unique_field(format_)
    is_unique_generated = indexer.is_unique_generated(format_)

    # generate morphline
    morphline = indexer.generate_morphline_config(collection_name, format_, unique_field)

    schema_fields = indexer.get_kept_field_list(format_['columns'])
    if is_unique_generated:
      schema_fields += [{"name": unique_field, "type": "string"}]


    # create the collection from the specified fields
    collection_manager = CollectionManagerController("test")
    if collection_manager.collection_exists(collection_name):
      collection_manager.delete_collection(collection_name, None)
    collection_manager.create_collection(collection_name, schema_fields, unique_key_field=unique_field)

    # index the file
    indexer.run_morphline(MockedRequest(user=user, fs=cluster.fs, jt=cluster.jt), collection_name, morphline, input_loc)
Example #15
def _small_indexing(user, fs, client, source, destination, index_name):
  kwargs = {}
  errors = []

  if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
    path = urllib.unquote(source["path"])
    stats = fs.stats(path)
    if stats.size > MAX_UPLOAD_SIZE:
      raise PopupException(_('File size is too large to handle!'))

  indexer = MorphlineIndexer(user, fs)

  fields = indexer.get_field_list(destination['columns'])
  _create_solr_collection(user, fs, client, destination, index_name, kwargs)

  if source['inputFormat'] == 'file':
    path = urllib.unquote(source["path"])
    data = fs.read(path, 0, MAX_UPLOAD_SIZE)

  if client.is_solr_six_or_more():
    kwargs['processor'] = 'tolerant'
    kwargs['map'] = 'NULL:'

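  # index the data; on failure, drop the partially created collection in the except block below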
  try:
    if source['inputFormat'] == 'query':
      query_id = source['query']['id'] if source['query'].get('id') else source['query']

      notebook = Notebook(document=Document2.objects.document(user=user, doc_id=query_id)).get_data()
      request = MockedDjangoRequest(user=user)
      snippet = notebook['snippets'][0]

      searcher = CollectionManagerController(user)
      columns = [field['name'] for field in fields if field['name'] != 'hue_id']
      fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(notebook, snippet, rows=rows, start_over=start_over) # Assumes handle still live
      rows = searcher.update_data_from_hive(index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs)
      # TODO if rows == MAX_ROWS truncation warning
    elif source['inputFormat'] == 'manual':
      pass # No need to do anything
    else:
      response = client.index(name=index_name, data=data, **kwargs)
      errors = [error.get('message', '') for error in response['responseHeader'].get('errors', [])]
  except Exception as e:
    try:
      client.delete_index(index_name, keep_config=False)
    except Exception as e2:
      LOG.warn('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
Example #16
def _index(request, file_format, collection_name, query=None, start_time=None, lib_path=None):
  indexer = MorphlineIndexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  client = SolrClient(user=request.user)

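  # create the target collection with the derived schema if it does not exist yet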
  if not client.exists(collection_name):
    client.create_index(
      name=collection_name,
      fields=request.POST.get('fields', schema_fields),
      unique_key_field=unique_field
    )

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % file_format["path"]
  elif file_format['inputFormat'] == 'hs2_handle':
    searcher = CollectionManagerController(request.user)
    columns = ['_uuid'] + [field['name'] for field in file_format['columns']]
    return searcher.update_data_from_hive(collection_name, columns, fetch_handle=file_format['fetch_handle'])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field, lib_path=lib_path)

  return indexer.run_morphline(request, collection_name, morphline, input_path, query, start_time=start_time, lib_path=lib_path)
Example #17
def _large_indexing(request, file_format, collection_name, query=None, start_time=None, lib_path=None):
  indexer = MorphlineIndexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  client = SolrClient(user=request.user)

  if not client.exists(collection_name):
    client.create_index(
      name=collection_name,
      fields=request.POST.get('fields', schema_fields),
      unique_key_field=unique_field
      # No df currently
    )

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % urllib.unquote(file_format["path"])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field, lib_path=lib_path)

  return indexer.run_morphline(request, collection_name, morphline, input_path, query, start_time=start_time, lib_path=lib_path)
Example #18
def _create_solr_collection(user, fs, client, destination, index_name, kwargs):
    unique_key_field = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
    df = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None

    indexer = MorphlineIndexer(user, fs)
    fields = indexer.get_field_list(destination['columns'])
    skip_fields = [field['name'] for field in fields if not field['keep']]

    kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
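    # split operations yield multi-valued fields; pass the split separator on to Solr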
    for field in fields:
        for operation in field['operations']:
            if operation['type'] == 'split':
                field['multiValued'] = True  # Solr requires multiValued to be set when splitting
                kwargs['f.%(name)s.split' % field] = 'true'
                kwargs['f.%(name)s.separator' % field] = operation['settings']['splitChar'] or ','

    if skip_fields:
        kwargs['skip'] = ','.join(skip_fields)
        fields = [
            field for field in fields if field['name'] not in skip_fields
        ]

    if not unique_key_field:
        unique_key_field = 'hue_id'
        fields += [{"name": unique_key_field, "type": "string"}]
        kwargs['rowid'] = unique_key_field

    if not destination['hasHeader']:
        kwargs['header'] = 'false'
    else:
        kwargs['skipLines'] = 1

    if not client.exists(index_name):
        client.create_index(
            name=index_name,
            config_name=destination.get('indexerConfigSet'),
            fields=fields,
            unique_key_field=unique_key_field,
            df=df,
            shards=destination['indexerNumShards'],
            replication=destination['indexerReplicationFactor'])
Example #19
File: api3.py Project: hkj123/hue
def guess_field_types(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        indexer = MorphlineIndexer(request.user, request.fs)
        path = urllib_unquote(file_format["path"])
        stream = request.fs.open(path)
        encoding = check_encoding(stream.read(10000))
        stream.seek(0)
        _convert_format(file_format["format"], inverse=True)

        format_ = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": path
            },
            "format": file_format['format']
        })

        # Note: Would also need to set charset to table (only supported in Hive)
        if 'sample' in format_ and format_['sample']:
            format_['sample'] = escape_rows(format_['sample'],
                                            nulls_only=True,
                                            encoding=encoding)
        for col in format_['columns']:
            col['name'] = smart_unicode(col['name'],
                                        errors='replace',
                                        encoding=encoding)

    elif file_format['inputFormat'] == 'table':
        sample = get_api(request, {
            'type': 'hive'
        }).get_sample_data({'type': 'hive'},
                           database=file_format['databaseName'],
                           table=file_format['tableName'])
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])

        format_ = {
            "sample":
            sample['rows'][:4],
            "columns": [
                Field(col.name,
                      HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type,
                                                          'string')).to_dict()
                for col in table_metadata.cols
            ]
        }
    elif file_format['inputFormat'] == 'query':
        query_id = file_format['query']['id'] if file_format['query'].get(
            'id') else file_format['query']

        notebook = Notebook(document=Document2.objects.document(
            user=request.user, doc_id=query_id)).get_data()
        snippet = notebook['snippets'][0]
        db = get_api(request, snippet)

        if file_format.get('sampleCols'):
            columns = file_format.get('sampleCols')
            sample = file_format.get('sample')
        else:
            snippet['query'] = snippet['statement']
            try:
                sample = db.fetch_result(notebook, snippet, 4,
                                         start_over=True)['rows'][:4]
            except Exception as e:
                LOG.warning(
                    'Skipping sample data as query handle might be expired: %s'
                    % e)
                sample = [[], [], [], [], []]
            columns = db.autocomplete(snippet=snippet, database='', table='')
            columns = [
                Field(
                    col['name'],
                    HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'],
                                                        'string')).to_dict()
                for col in columns['extended_columns']
            ]
        format_ = {
            "sample": sample,
            "columns": columns,
        }
    elif file_format['inputFormat'] == 'rdbms':
        api = _get_api(request)
        sample = api.get_sample_data(None,
                                     database=file_format['rdbmsDatabaseName'],
                                     table=file_format['tableName'])

        format_ = {
            "sample":
            list(sample['rows'])[:4],
            "columns": [
                Field(col['name'], col['type']).to_dict()
                for col in sample['full_headers']
            ]
        }
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            data = get_topic_data(request.user,
                                  file_format.get('kafkaSelectedTopics'))

            kafkaFieldNames = [col['name'] for col in data['full_headers']]
            kafkaFieldTypes = [col['type'] for col in data['full_headers']]
            topics_data = data['rows']

            format_ = {
                "sample":
                topics_data,
                "columns": [
                    Field(col, 'string', unique=False).to_dict()
                    for col in kafkaFieldNames
                ]
            }


#       data = """%(kafkaFieldNames)s
# %(data)s""" % {
#         'kafkaFieldNames': ','.join(kafkaFieldNames),
#         'data': '\n'.join([','.join(cols) for cols in topics_data])
#       }
#       stream = string_io()
#       stream.write(data)

#       _convert_format(file_format["format"], inverse=True)

#       indexer = MorphlineIndexer(request.user, request.fs)

#       format_ = indexer.guess_field_types({
#         "file": {
#             "stream": stream,
#             "name": file_format['path']
#         },
#         "format": file_format['format']
#       })
#       type_mapping = dict(
#         list(
#           zip(kafkaFieldNames, kafkaFieldTypes)
#         )
#       )

#       for col in format_['columns']:
#         col['keyType'] = type_mapping[col['name']]
#         col['type'] = type_mapping[col['name']]
        elif file_format['streamSelection'] == 'flume':
            if 'hue-httpd/access_log' in file_format['channelSourcePath']:
                columns = [{
                    'name': 'id',
                    'type': 'string',
                    'unique': True
                }, {
                    'name': 'client_ip',
                    'type': 'string'
                }, {
                    'name': 'time',
                    'type': 'date'
                }, {
                    'name': 'request',
                    'type': 'string'
                }, {
                    'name': 'code',
                    'type': 'plong'
                }, {
                    'name': 'bytes',
                    'type': 'plong'
                }, {
                    'name': 'method',
                    'type': 'string'
                }, {
                    'name': 'url',
                    'type': 'string'
                }, {
                    'name': 'protocol',
                    'type': 'string'
                }, {
                    'name': 'app',
                    'type': 'string'
                }, {
                    'name': 'subapp',
                    'type': 'string'
                }]
            else:
                columns = [{'name': 'message', 'type': 'string'}]

            format_ = {
                "sample": [['...'] * len(columns)] * 4,
                "columns": [
                    Field(col['name'],
                          HiveFormat.FIELD_TYPE_TRANSLATE.get(
                              col['type'], 'string'),
                          unique=col.get('unique')).to_dict()
                    for col in columns
                ]
            }
    elif file_format['inputFormat'] == 'connector':
        if file_format['connectorSelection'] == 'sfdc':
            sf = Salesforce(username=file_format['streamUsername'],
                            password=file_format['streamPassword'],
                            security_token=file_format['streamToken'])
            table_metadata = [{
                'name': column['name'],
                'type': column['type']
            } for column in sf.restful('sobjects/%(streamObject)s/describe/' %
                                       file_format)['fields']]
            query = 'SELECT %s FROM %s LIMIT 4' % (', '.join(
                [col['name']
                 for col in table_metadata]), file_format['streamObject'])
            print(query)

            try:
                records = sf.query_all(query)
            except SalesforceRefusedRequest as e:
                raise PopupException(message=str(e))

            format_ = {
                "sample":
                [list(row.values())[1:] for row in records['records']],
                "columns": [
                    Field(
                        col['name'],
                        HiveFormat.FIELD_TYPE_TRANSLATE.get(
                            col['type'], 'string')).to_dict()
                    for col in table_metadata
                ]
            }
        else:
            raise PopupException(
                _('Connector format not recognized: %(connectorSelection)s') %
                file_format)
    else:
        raise PopupException(
            _('Input format not recognized: %(inputFormat)s') % file_format)

    return JsonResponse(format_)
Example #20
File: api3.py Project: hkj123/hue
def guess_format(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        path = urllib_unquote(file_format["path"])
        indexer = MorphlineIndexer(request.user, request.fs)
        if not request.fs.isfile(path):
            raise PopupException(
                _('Path %(path)s is not a file') % file_format)

        stream = request.fs.open(path)
        format_ = indexer.guess_format(
            {"file": {
                "stream": stream,
                "name": path
            }})
        _convert_format(format_)
    elif file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        try:
            table_metadata = db.get_table(database=file_format['databaseName'],
                                          table_name=file_format['tableName'])
        except Exception as e:
            raise PopupException(
                e.message if hasattr(e, 'message') and e.message else e)
        storage = {}
        for delim in table_metadata.storage_details:
            if delim['data_type']:
                if '=' in delim['data_type']:
                    key, val = delim['data_type'].split('=', 1)
                    storage[key] = val
                else:
                    storage[delim['data_type']] = delim['comment']
        if table_metadata.details['properties']['format'] == 'text':
            format_ = {
                "quoteChar": "\"",
                "recordSeparator": '\\n',
                "type": "csv",
                "hasHeader": False,
                "fieldSeparator": storage.get('field.delim', ',')
            }
        elif table_metadata.details['properties']['format'] == 'parquet':
            format_ = {
                "type": "parquet",
                "hasHeader": False,
            }
        else:
            raise PopupException(
                'Hive table format %s is not supported.' %
                table_metadata.details['properties']['format'])
    elif file_format['inputFormat'] == 'query':
        format_ = {
            "quoteChar": "\"",
            "recordSeparator": "\\n",
            "type": "csv",
            "hasHeader": False,
            "fieldSeparator": "\u0001"
        }
    elif file_format['inputFormat'] == 'rdbms':
        format_ = {"type": "csv"}
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            format_ = {
                "type": "json",
                # "fieldSeparator": ",",
                # "hasHeader": True,
                # "quoteChar": "\"",
                # "recordSeparator": "\\n",
                'topics': get_topics(request.user)
            }
        elif file_format['streamSelection'] == 'flume':
            format_ = {
                "type": "csv",
                "fieldSeparator": ",",
                "hasHeader": True,
                "quoteChar": "\"",
                "recordSeparator": "\\n"
            }
    elif file_format['inputFormat'] == 'connector':
        if file_format['connectorSelection'] == 'sfdc':
            sf = Salesforce(username=file_format['streamUsername'],
                            password=file_format['streamPassword'],
                            security_token=file_format['streamToken'])
            format_ = {
                "type": "csv",
                "fieldSeparator": ",",
                "hasHeader": True,
                "quoteChar": "\"",
                "recordSeparator": "\\n",
                'objects': [
                    sobject['name']
                    for sobject in sf.restful('sobjects/')['sobjects']
                    if sobject['queryable']
                ]
            }
        else:
            raise PopupException(
                _('Input format %(inputFormat)s connector not recognized: %(connectorSelection)s')
                % file_format)
    else:
        raise PopupException(
            _('Input format not recognized: %(inputFormat)s') % file_format)

    format_['status'] = 0
    return JsonResponse(format_)
Example #21
def guess_field_types(request):
  file_format = json.loads(request.POST.get('fileFormat', '{}'))

  if file_format['inputFormat'] == 'file':
    indexer = MorphlineIndexer(request.user, request.fs)
    path = urllib.unquote(file_format["path"])
    stream = request.fs.open(path)
    encoding = chardet.detect(stream.read(10000)).get('encoding')
    stream.seek(0)
    _convert_format(file_format["format"], inverse=True)

    format_ = indexer.guess_field_types({
      "file": {
          "stream": stream,
          "name": path
        },
      "format": file_format['format']
    })

    # Note: Would also need to set charset to table (only supported in Hive)
    if 'sample' in format_:
      format_['sample'] = escape_rows(format_['sample'], nulls_only=True, encoding=encoding)
    for col in format_['columns']:
      col['name'] = smart_unicode(col['name'], errors='replace', encoding=encoding)

  elif file_format['inputFormat'] == 'table':
    sample = get_api(request, {'type': 'hive'}).get_sample_data({'type': 'hive'}, database=file_format['databaseName'], table=file_format['tableName'])
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])

    format_ = {
        "sample": sample['rows'][:4],
        "columns": [
            Field(col.name, HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')).to_dict()
            for col in table_metadata.cols
        ]
    }
  elif file_format['inputFormat'] == 'query':
    query_id = file_format['query']['id'] if file_format['query'].get('id') else file_format['query']

    notebook = Notebook(document=Document2.objects.document(user=request.user, doc_id=query_id)).get_data()
    snippet = notebook['snippets'][0]
    db = get_api(request, snippet)

    if file_format.get('sampleCols'):
      columns = file_format.get('sampleCols')
      sample = file_format.get('sample')
    else:
      snippet['query'] = snippet['statement']
      try:
        sample = db.fetch_result(notebook, snippet, 4, start_over=True)['rows'][:4]
      except Exception as e:
        LOG.warn('Skipping sample data as query handle might be expired: %s' % e)
        sample = [[], [], [], [], []]
      columns = db.autocomplete(snippet=snippet, database='', table='')
      columns = [
          Field(col['name'], HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')).to_dict()
          for col in columns['extended_columns']
      ]
    format_ = {
        "sample": sample,
        "columns": columns,
    }
Example #22
        ]
    }
  elif file_format['inputFormat'] == 'stream':
    # Note: mocked here, should come from SFDC or Kafka API or sampling job
    if file_format['streamSelection'] == 'kafka':
      data = """%(kafkaFieldNames)s
%(data)s""" % {
        'kafkaFieldNames': file_format.get('kafkaFieldNames', ''),
        'data': '\n'.join([','.join(['...'] * len(file_format.get('kafkaFieldTypes', '').split(',')))] * 5)
      }
      stream = StringIO.StringIO()
      stream.write(data)

      _convert_format(file_format["format"], inverse=True)

      indexer = MorphlineIndexer(request.user, request.fs)
      format_ = indexer.guess_field_types({
        "file": {
            "stream": stream,
            "name": file_format['path']
          },
        "format": file_format['format']
      })

      type_mapping = dict(zip(file_format['kafkaFieldNames'].split(','), file_format['kafkaFieldTypes'].split(',')))
      for col in format_['columns']:
        col['keyType'] = type_mapping[col['name']]
        col['type'] = type_mapping[col['name']]
    elif file_format['streamSelection'] == 'sfdc':
      sf = Salesforce(
          username=file_format['streamUsername'],
Example #23
def _envelope_job(request,
                  file_format,
                  destination,
                  start_time=None,
                  lib_path=None):
    collection_name = destination['name']
    indexer = EnvelopeIndexer(request.user, request.fs)

    lib_path = None  # Todo optional input field
    input_path = None

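    # build the Envelope job properties from the selected input format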
    if file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])
        input_path = table_metadata.path_location
    elif file_format['inputFormat'] == 'file':
        input_path = file_format["path"]
        properties = {'input_path': input_path, 'format': 'csv'}
    elif file_format['inputFormat'] == 'stream' and file_format[
            'streamSelection'] == 'flume':
        pass
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            manager = ManagerApi()
            properties = {
                "brokers": manager.get_kafka_brokers(),
                "topics": file_format['kafkaSelectedTopics'],
                "kafkaFieldType": file_format['kafkaFieldType'],
                "kafkaFieldDelimiter": file_format['kafkaFieldDelimiter'],
            }

            if file_format.get(
                    'kafkaSelectedTopics') == 'NavigatorAuditEvents':
                schema_fields = MorphlineIndexer.get_kept_field_list(
                    file_format['sampleCols'])
                properties.update({
                    "kafkaFieldNames":
                    ', '.join([_field['name'] for _field in schema_fields]),
                    "kafkaFieldTypes":
                    ', '.join([_field['type'] for _field in schema_fields])
                })
            else:
                properties.update({
                    "kafkaFieldNames":
                    file_format['kafkaFieldNames'],
                    "kafkaFieldTypes":
                    file_format['kafkaFieldTypes']
                })

            if True:
                properties['window'] = ''
            else:  # For "KafkaSQL"
                properties['window'] = '''
            window {
                enabled = true
                milliseconds = 60000
            }'''
    elif file_format['inputFormat'] == 'connector':
        if file_format['streamSelection'] == 'flume':
            properties = {
                'streamSelection':
                file_format['streamSelection'],
                'channelSourceHosts':
                file_format['channelSourceHosts'],
                'channelSourceSelectedHosts':
                file_format['channelSourceSelectedHosts'],
                'channelSourcePath':
                file_format['channelSourcePath'],
            }
        else:
            # sfdc
            properties = {
                'streamSelection': file_format['streamSelection'],
                'streamUsername': file_format['streamUsername'],
                'streamPassword': file_format['streamPassword'],
                'streamToken': file_format['streamToken'],
                'streamEndpointUrl': file_format['streamEndpointUrl'],
                'streamObject': file_format['streamObject'],
            }

    if destination['outputFormat'] == 'table':
        if destination['isTargetExisting']:  # Todo: check if format matches
            pass
        else:
            destination['importData'] = False  # Avoid LOAD DATA
            if destination['tableFormat'] == 'kudu':
                properties['kafkaFieldNames'] = properties['kafkaFieldNames'].lower()  # Kudu names should be all lowercase
            # Create table
            if not request.POST.get('show_command'):
                SQLIndexer(user=request.user,
                           fs=request.fs).create_table_from_a_file(
                               file_format, destination).execute(request)

        if destination['tableFormat'] == 'kudu':
            manager = ManagerApi()
            properties["output_table"] = "impala::%s" % collection_name
            properties["kudu_master"] = manager.get_kudu_master()
        else:
            properties['output_table'] = collection_name
    elif destination['outputFormat'] == 'stream':
        manager = ManagerApi()
        properties['brokers'] = manager.get_kafka_brokers()
        properties['topics'] = file_format['kafkaSelectedTopics']
        properties['kafkaFieldDelimiter'] = file_format['kafkaFieldDelimiter']
    elif destination['outputFormat'] == 'file':
        properties['path'] = file_format["path"]
        if file_format['inputFormat'] == 'stream':
            properties['format'] = 'csv'
        else:
            properties['format'] = file_format['tableFormat']  # or csv
    elif destination['outputFormat'] == 'index':
        properties['collectionName'] = collection_name
        properties['connection'] = SOLR_URL.get()

    properties["app_name"] = 'Data Ingest'
    properties["inputFormat"] = file_format['inputFormat']
    properties["ouputFormat"] = destination['ouputFormat']
    properties["streamSelection"] = file_format["streamSelection"]

    configs = indexer.generate_config(properties)

    if request.POST.get('show_command'):
        return {'status': 0, 'commands': configs['envelope.conf']}
    else:
        return indexer.run(request,
                           collection_name,
                           configs,
                           input_path,
                           start_time=start_time,
                           lib_path=lib_path)
Example #24
def guess_field_types(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))

    if file_format['inputFormat'] == 'file':
        indexer = MorphlineIndexer(request.user, request.fs)
        path = urllib_unquote(file_format["path"])
        stream = request.fs.open(path)
        encoding = chardet.detect(stream.read(10000)).get('encoding')
        stream.seek(0)
        _convert_format(file_format["format"], inverse=True)

        format_ = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": path
            },
            "format": file_format['format']
        })

        # Note: Would also need to set charset to table (only supported in Hive)
        if 'sample' in format_ and format_['sample']:
            format_['sample'] = escape_rows(format_['sample'],
                                            nulls_only=True,
                                            encoding=encoding)
        for col in format_['columns']:
            col['name'] = smart_unicode(col['name'],
                                        errors='replace',
                                        encoding=encoding)

    elif file_format['inputFormat'] == 'table':
        sample = get_api(request, {
            'type': 'hive'
        }).get_sample_data({'type': 'hive'},
                           database=file_format['databaseName'],
                           table=file_format['tableName'])
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])

        format_ = {
            "sample":
            sample['rows'][:4],
            "columns": [
                Field(col.name,
                      HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type,
                                                          'string')).to_dict()
                for col in table_metadata.cols
            ]
        }
    elif file_format['inputFormat'] == 'query':
        query_id = file_format['query']['id'] if file_format['query'].get(
            'id') else file_format['query']

        notebook = Notebook(document=Document2.objects.document(
            user=request.user, doc_id=query_id)).get_data()
        snippet = notebook['snippets'][0]
        db = get_api(request, snippet)

        if file_format.get('sampleCols'):
            columns = file_format.get('sampleCols')
            sample = file_format.get('sample')
        else:
            snippet['query'] = snippet['statement']
            try:
                sample = db.fetch_result(notebook, snippet, 4,
                                         start_over=True)['rows'][:4]
            except Exception as e:
                LOG.warn(
                    'Skipping sample data as query handle might be expired: %s'
                    % e)
                sample = [[], [], [], [], []]
            columns = db.autocomplete(snippet=snippet, database='', table='')
            columns = [
                Field(
                    col['name'],
                    HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'],
                                                        'string')).to_dict()
                for col in columns['extended_columns']
            ]
        format_ = {
            "sample": sample,
            "columns": columns,
        }
    elif file_format['inputFormat'] == 'rdbms':
        api = _get_api(request)
        sample = api.get_sample_data(None,
                                     database=file_format['rdbmsDatabaseName'],
                                     table=file_format['tableName'])

        format_ = {
            "sample":
            list(sample['rows'])[:4],
            "columns": [
                Field(col['name'], col['type']).to_dict()
                for col in sample['full_headers']
            ]
        }
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            if file_format.get(
                    'kafkaSelectedTopics') == 'NavigatorAuditEvents':
                kafkaFieldNames = [
                    'id', 'additionalInfo', 'allowed', 'collectionName',
                    'databaseName', 'db', 'DELEGATION_TOKEN_ID', 'dst',
                    'entityId', 'family', 'impersonator', 'ip', 'name',
                    'objectType', 'objType', 'objUsageType', 'operationParams',
                    'operationText', 'op', 'opText', 'path', 'perms',
                    'privilege', 'qualifier', 'QUERY_ID', 'resourcePath',
                    'service', 'SESSION_ID', 'solrVersion', 'src', 'status',
                    'subOperation', 'tableName', 'table', 'time', 'type',
                    'url', 'user'
                ]
                kafkaFieldTypes = ['string'] * len(kafkaFieldNames)
                kafkaFieldNames.append('timeDate')
                kafkaFieldTypes.append('date')
            else:
                # Note: mocked here, should come from SFDC or Kafka API or sampling job
                kafkaFieldNames = file_format.get('kafkaFieldNames',
                                                  '').split(',')
                kafkaFieldTypes = file_format.get('kafkaFieldTypes',
                                                  '').split(',')

            data = """%(kafkaFieldNames)s
%(data)s""" % {
                'kafkaFieldNames': ','.join(kafkaFieldNames),
                'data': '\n'.join(
                    [','.join(['...'] * len(kafkaFieldTypes))] * 5)
            }
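            # Mock up a CSV-like payload (header line plus placeholder rows) so the
            # Morphline indexer below can sample the stream like a regular file.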
            stream = string_io()
            stream.write(data)

            _convert_format(file_format["format"], inverse=True)

            indexer = MorphlineIndexer(request.user, request.fs)
            format_ = indexer.guess_field_types({
                "file": {
                    "stream": stream,
                    "name": file_format['path']
                },
                "format": file_format['format']
            })
            type_mapping = dict(list(zip(kafkaFieldNames, kafkaFieldTypes)))

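            # Override the guessed column types with the declared Kafka field types.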
            for col in format_['columns']:
                col['keyType'] = type_mapping[col['name']]
                col['type'] = type_mapping[col['name']]
        elif file_format['streamSelection'] == 'flume':
            if 'hue-httpd/access_log' in file_format['channelSourcePath']:
                columns = [
                    {'name': 'id', 'type': 'string', 'unique': True},
                    {'name': 'client_ip', 'type': 'string'},
                    {'name': 'time', 'type': 'date'},
                    {'name': 'request', 'type': 'string'},
                    {'name': 'code', 'type': 'plong'},
                    {'name': 'bytes', 'type': 'plong'},
                    {'name': 'method', 'type': 'string'},
                    {'name': 'url', 'type': 'string'},
                    {'name': 'protocol', 'type': 'string'},
                    {'name': 'app', 'type': 'string'},
                    {'name': 'subapp', 'type': 'string'},
                ]
            else:
                columns = [{'name': 'message', 'type': 'string'}]

            format_ = {
                "sample": [['...'] * len(columns)] * 4,
                "columns": [
                    Field(col['name'],
                          HiveFormat.FIELD_TYPE_TRANSLATE.get(
                              col['type'], 'string'),
                          unique=col.get('unique')).to_dict()
                    for col in columns
                ]
            }
    elif file_format['inputFormat'] == 'connector':
        if file_format['connectorSelection'] == 'sfdc':
            sf = Salesforce(username=file_format['streamUsername'],
                            password=file_format['streamPassword'],
                            security_token=file_format['streamToken'])
            table_metadata = [{
                'name': column['name'],
                'type': column['type']
            } for column in sf.restful('sobjects/%(streamObject)s/describe/' %
                                       file_format)['fields']]
            query = 'SELECT %s FROM %s LIMIT 4' % (
                ', '.join([col['name'] for col in table_metadata]),
                file_format['streamObject'])
            print(query)

            try:
                records = sf.query_all(query)
            except SalesforceRefusedRequest as e:
                raise PopupException(message=str(e))

            format_ = {
                "sample": [list(row.values())[1:] for row in records['records']],
                "columns": [
                    Field(col['name'],
                          HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'],
                                                              'string')).to_dict()
                    for col in table_metadata
                ]
            }
        else:
            raise PopupException(
                _('Connector format not recognized: %(connectorSelection)s') %
                file_format)
    else:
        raise PopupException(
            _('Input format not recognized: %(inputFormat)s') % file_format)

    return JsonResponse(format_)
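
For orientation, a rough sketch of the JSON this view returns for the 'stream'/'kafka' branch above; the 'name'/'type' keys of Field(...).to_dict() are assumed here, while 'sample', 'columns', 'keyType' and the type overrides come straight from the code.

# Hypothetical response shape (placeholder values only).
kafka_format_example = {
    "sample": [["...", "...", "..."], ["...", "...", "..."]],
    "columns": [
        {"name": "id", "type": "string", "keyType": "string"},
        {"name": "timeDate", "type": "date", "keyType": "date"},
    ],
}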
Example #25
def _small_indexing(user, fs, client, source, destination, index_name):
    unique_key_field = destination['indexerPrimaryKey'] and destination[
        'indexerPrimaryKey'][0] or None
    df = destination['indexerDefaultField'] and destination[
        'indexerDefaultField'][0] or None
    kwargs = {}
    errors = []

    if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
        stats = fs.stats(source["path"])
        if stats.size > MAX_UPLOAD_SIZE:
            raise PopupException(_('File size is too large to handle!'))

    indexer = MorphlineIndexer(user, fs)
    fields = indexer.get_field_list(destination['columns'])
    skip_fields = [field['name'] for field in fields if not field['keep']]

    kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
    if skip_fields:
        kwargs['skip'] = ','.join(skip_fields)
        fields = [
            field for field in fields if field['name'] not in skip_fields
        ]

    if not unique_key_field:
        unique_key_field = 'hue_id'
        fields += [{"name": unique_key_field, "type": "string"}]
        kwargs['rowid'] = unique_key_field

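    # CSV header handling: either tell Solr there is no header line, or skip it.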
    if not destination['hasHeader']:
        kwargs['header'] = 'false'
    else:
        kwargs['skipLines'] = 1

    if not client.exists(index_name):
        client.create_index(
            name=index_name,
            config_name=destination.get('indexerConfigSet'),
            fields=fields,
            unique_key_field=unique_key_field,
            df=df,
            shards=destination['indexerNumShards'],
            replication=destination['indexerReplicationFactor'])

    if source['inputFormat'] == 'file':
        data = fs.read(source["path"], 0, MAX_UPLOAD_SIZE)

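    # On Solr 6+ the 'tolerant' update processor lets bad documents be skipped
    # instead of failing the whole indexing batch.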
    if client.is_solr_six_or_more():
        kwargs['processor'] = 'tolerant'

    try:
        if source['inputFormat'] == 'query':
            query_id = source['query']['id'] if source['query'].get(
                'id') else source['query']

            notebook = Notebook(document=Document2.objects.document(
                user=user, doc_id=query_id)).get_data()
            request = MockedDjangoRequest(user=user)
            snippet = notebook['snippets'][0]

            searcher = CollectionManagerController(user)
            columns = [
                field['name'] for field in fields if field['name'] != 'hue_id'
            ]
            fetch_handle = lambda rows, start_over: get_api(
                request, snippet).fetch_result(
                    notebook, snippet, rows=rows, start_over=start_over
                )  # Assumes handle still live
            rows = searcher.update_data_from_hive(index_name,
                                                  columns,
                                                  fetch_handle=fetch_handle,
                                                  indexing_options=kwargs)
            # TODO if rows == MAX_ROWS truncation warning
        else:
            response = client.index(name=index_name, data=data, **kwargs)
            errors = [
                error.get('message', '')
                for error in response['responseHeader'].get('errors', [])
            ]
    except Exception as e:
        try:
            client.delete_index(index_name, keep_config=False)
        except Exception as e2:
            LOG.warn(
                'Error while cleaning up config of failed collection creation %s: %s'
                % (index_name, e2))
        raise e
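
A minimal, hypothetical call site for _small_indexing(); the key names mirror the source/destination dicts read above, but every value (paths, column list, config set, shard counts) is a placeholder.

# Sketch only: assumes a SolrClient bound to the requesting user and a small CSV on HDFS.
source = {'inputFormat': 'file', 'path': '/user/demo/sales.csv'}
destination = {
    'name': 'sales_demo',
    'columns': [{'name': 'id', 'type': 'string', 'keep': True},
                {'name': 'amount', 'type': 'plong', 'keep': True}],
    'indexerPrimaryKey': ['id'],
    'indexerDefaultField': [],
    'indexerConfigSet': 'managedTemplate',  # config set name is an assumption
    'indexerNumShards': 1,
    'indexerReplicationFactor': 1,
    'hasHeader': True,
}
# _small_indexing(request.user, request.fs, SolrClient(request.user),
#                 source, destination, destination['name'])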
Example #26
File: api3.py Project: mapr/hue
def guess_format(request):
    file_format = json.loads(request.POST.get('fileFormat', '{}'))
    file_type = file_format['file_type']
    path = urllib_unquote(file_format["path"])

    if sys.version_info[0] < 3 and (file_type == 'excel' or path[-3:] == 'xls'
                                    or path[-4:] == 'xlsx'):
        return JsonResponse({
            'status': -1,
            'message': 'Python2 based Hue does not support the Excel file importer'
        })

    if file_format['inputFormat'] == 'localfile':
        if file_type == 'excel':
            format_ = {"type": "excel", "hasHeader": True}
        else:
            format_ = {
                "quoteChar": "\"",
                "recordSeparator": '\\n',
                "type": "csv",
                "hasHeader": True,
                "fieldSeparator": ","
            }

    elif file_format['inputFormat'] == 'file':
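        # Excel inputs are converted to CSV on the filesystem first, so the generic
        # CSV sniffing below can be reused on the converted file.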
        if path[-3:] == 'xls' or path[-4:] == 'xlsx':
            file_obj = request.fs.open(path)
            if path[-3:] == 'xls':
                df = pd.read_excel(file_obj.read(1024 * 1024 * 1024),
                                   engine='xlrd')
            else:
                df = pd.read_excel(file_obj.read(1024 * 1024 * 1024),
                                   engine='openpyxl')
            _csv_data = df.to_csv(index=False)

            path = excel_to_csv_file_name_change(path)
            request.fs.create(path, overwrite=True, data=_csv_data)

        indexer = MorphlineIndexer(request.user, request.fs)
        if not request.fs.isfile(path):
            raise PopupException(
                _('Path %(path)s is not a file') % file_format)

        stream = request.fs.open(path)
        format_ = indexer.guess_format(
            {"file": {
                "stream": stream,
                "name": path
            }})
        _convert_format(format_)

        if file_format["path"][-3:] == 'xls' or file_format["path"][
                -4:] == 'xlsx':
            format_ = {
                "quoteChar": "\"",
                "recordSeparator": '\\n',
                "type": "excel",
                "hasHeader": True,
                "fieldSeparator": ","
            }

    elif file_format['inputFormat'] == 'table':
        db = dbms.get(request.user)
        try:
            table_metadata = db.get_table(database=file_format['databaseName'],
                                          table_name=file_format['tableName'])
        except Exception as e:
            raise PopupException(
                e.message if hasattr(e, 'message') and e.message else e)
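        # Flatten the table's storage descriptors ('key=value' pairs such as
        # field.delim) into a plain dict.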
        storage = {}
        for delim in table_metadata.storage_details:
            if delim['data_type']:
                if '=' in delim['data_type']:
                    key, val = delim['data_type'].split('=', 1)
                    storage[key] = val
                else:
                    storage[delim['data_type']] = delim['comment']
        if table_metadata.details['properties']['format'] == 'text':
            format_ = {
                "quoteChar": "\"",
                "recordSeparator": '\\n',
                "type": "csv",
                "hasHeader": False,
                "fieldSeparator": storage.get('field.delim', ',')
            }
        elif table_metadata.details['properties']['format'] == 'parquet':
            format_ = {
                "type": "parquet",
                "hasHeader": False,
            }
        else:
            raise PopupException(
                'Hive table format %s is not supported.' %
                table_metadata.details['properties']['format'])
    elif file_format['inputFormat'] == 'query':
        format_ = {
            "quoteChar": "\"",
            "recordSeparator": "\\n",
            "type": "csv",
            "hasHeader": False,
            "fieldSeparator": "\u0001"
        }
    elif file_format['inputFormat'] == 'rdbms':
        format_ = {"type": "csv"}
    elif file_format['inputFormat'] == 'stream':
        if file_format['streamSelection'] == 'kafka':
            format_ = {
                "type": "json",
                # "fieldSeparator": ",",
                # "hasHeader": True,
                # "quoteChar": "\"",
                # "recordSeparator": "\\n",
                'topics': get_topics(request.user)
            }
        elif file_format['streamSelection'] == 'flume':
            format_ = {
                "type": "csv",
                "fieldSeparator": ",",
                "hasHeader": True,
                "quoteChar": "\"",
                "recordSeparator": "\\n"
            }
    elif file_format['inputFormat'] == 'connector':
        if file_format['connectorSelection'] == 'sfdc':
            sf = Salesforce(username=file_format['streamUsername'],
                            password=file_format['streamPassword'],
                            security_token=file_format['streamToken'])
            format_ = {
                "type": "csv",
                "fieldSeparator": ",",
                "hasHeader": True,
                "quoteChar": "\"",
                "recordSeparator": "\\n",
                'objects': [
                    sobject['name']
                    for sobject in sf.restful('sobjects/')['sobjects']
                    if sobject['queryable']
                ]
            }
        else:
            raise PopupException(
                _('Input format %(inputFormat)s connector not recognized: %(connectorSelection)s')
                % file_format)
    else:
        raise PopupException(
            _('Input format not recognized: %(inputFormat)s') % file_format)

    format_['status'] = 0
    return JsonResponse(format_)
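
For reference, a sketch of what a successful response for a plain CSV file might look like; the separator and header values depend on what the MorphlineIndexer sniffs, and 'status': 0 is appended unconditionally before returning.

# Hypothetical guess_format() response for a comma-separated file with a header line.
expected_format = {
    "quoteChar": "\"",
    "recordSeparator": "\\n",
    "type": "csv",
    "hasHeader": True,
    "fieldSeparator": ",",
    "status": 0,
}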