Example #1
def _create_index(user, fs, client, source, destination, index_name):
    unique_key_field = destination['indexerPrimaryKey'] and destination[
        'indexerPrimaryKey'][0] or None
    df = destination['indexerDefaultField'] and destination[
        'indexerDefaultField'][0] or None
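    # Note: `x and x[0] or None` is a legacy conditional idiom: it yields the
    # list's first element when the list is non-empty, otherwise None
    # (equivalent to `x[0] if x else None`).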
    kwargs = {}

    if source['inputFormat'] not in ('manual', 'table'):
        stats = fs.stats(source["path"])
        if stats.size > MAX_UPLOAD_SIZE:
            raise PopupException(_('File size is too large to handle!'))

    indexer = MorphlineIndexer(user, fs)
    fields = indexer.get_field_list(destination['columns'])
    skip_fields = [field['name'] for field in fields if not field['keep']]

    kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
    if skip_fields:
        kwargs['skip'] = ','.join(skip_fields)
        fields = [
            field for field in fields if field['name'] not in skip_fields
        ]

    if not unique_key_field:
        unique_key_field = 'hue_id'
        fields += [{"name": unique_key_field, "type": "string"}]
        kwargs['rowid'] = unique_key_field

    if not destination['hasHeader']:
        kwargs['header'] = 'false'
    else:
        kwargs['skipLines'] = 1

    if not client.exists(index_name):
        client.create_index(
            name=index_name,
            config_name=destination.get('indexerConfigSet'),
            fields=fields,
            unique_key_field=unique_key_field,
            df=df,
            shards=destination['indexerNumShards'],
            replication=destination['indexerReplicationFactor'])

    if source['inputFormat'] not in ('manual', 'table'):
        data = fs.read(source["path"], 0, MAX_UPLOAD_SIZE)
        try:
            client.index(name=index_name, data=data, **kwargs)
        except Exception as e:
            try:
                client.delete_index(index_name, keep_config=False)
            except Exception as e2:
                LOG.warn(
                    'Error while cleaning-up config of failed collection creation %s: %s'
                    % (index_name, e2))
            raise e
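
A minimal usage sketch (all values hypothetical): the source and destination
dicts below show only the keys _create_index actually reads; user, fs and
client are assumed to be live Django user, filesystem and Solr client objects,
and the column dicts follow the shape MorphlineIndexer.get_field_list expects.

source = {
    'inputFormat': 'file',
    'path': '/user/demo/books.csv',
}
destination = {
    'columns': [{'name': 'title', 'type': 'string', 'keep': True}],
    'indexerPrimaryKey': ['title'],
    'indexerDefaultField': ['title'],
    'indexerConfigSet': 'managedTemplate',  # hypothetical config set name
    'indexerNumShards': 1,
    'indexerReplicationFactor': 1,
    'hasHeader': True,
}
_create_index(user, fs, client, source, destination, 'books_index')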
Example #2
def _create_index(user, fs, client, source, destination, index_name):
    unique_key_field = destination['indexerPrimaryKey'] and destination['indexerPrimaryKey'][0] or None
    df = destination['indexerDefaultField'] and destination['indexerDefaultField'][0] or None
    kwargs = {}

    if source['inputFormat'] != 'manual':
        stats = fs.stats(source["path"])
        if stats.size > MAX_UPLOAD_SIZE:
            raise PopupException(_('File size is too large to handle!'))

    indexer = MorphlineIndexer(user, fs)
    fields = indexer.get_field_list(destination['columns'])
    skip_fields = [field['name'] for field in fields if not field['keep']]

    kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
    if skip_fields:
        kwargs['skip'] = ','.join(skip_fields)
        fields = [
            field for field in fields if field['name'] not in skip_fields
        ]

    if not unique_key_field:
        unique_key_field = 'hue_id'
        fields += [{"name": unique_key_field, "type": "string"}]
        kwargs['rowid'] = unique_key_field

    if not destination['hasHeader']:
        kwargs['header'] = 'false'
    else:
        kwargs['skipLines'] = 1

    if not client.exists(index_name):
        client.create_index(
            name=index_name,
            fields=fields,
            unique_key_field=unique_key_field,
            df=df,
            shards=destination['indexerNumShards'],
            replication=destination['indexerReplicationFactor'])

    if source['inputFormat'] != 'manual':
        data = fs.read(source["path"], 0, MAX_UPLOAD_SIZE)
        client.index(name=index_name, data=data, **kwargs)

    return {
        'status': 0,
        'on_success_url': reverse('indexer:indexes',
                                  kwargs={'index': index_name}),
        'pub_sub_url': 'assist.collections.refresh'
    }
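
The return value follows Hue's importer convention: a zero status for success,
a Django reverse()-built URL for the UI to redirect to, and a pub/sub topic
that refreshes the assist panel's collection list. A sketch of a caller
(hypothetical view code; names assumed):

from django.http import JsonResponse

result = _create_index(request.user, request.fs, client, source, destination, index_name)
response = JsonResponse(result)  # the frontend follows result['on_success_url']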
Example #3
def _small_indexing(user, fs, client, source, destination, index_name):
  kwargs = {}
  errors = []

  if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
    path = urllib.unquote(source["path"])
    stats = fs.stats(path)
    if stats.size > MAX_UPLOAD_SIZE:
      raise PopupException(_('File size is too large to handle!'))

  indexer = MorphlineIndexer(user, fs)

  fields = indexer.get_field_list(destination['columns'])
  _create_solr_collection(user, fs, client, destination, index_name, kwargs)

  if source['inputFormat'] == 'file':
    path = urllib.unquote(source["path"])
    data = fs.read(path, 0, MAX_UPLOAD_SIZE)

  if client.is_solr_six_or_more():
    kwargs['processor'] = 'tolerant'
    kwargs['map'] = 'NULL:'

  try:
    if source['inputFormat'] == 'query':
      query_id = source['query']['id'] if source['query'].get('id') else source['query']

      notebook = Notebook(document=Document2.objects.document(user=user, doc_id=query_id)).get_data()
      request = MockedDjangoRequest(user=user)
      snippet = notebook['snippets'][0]

      searcher = CollectionManagerController(user)
      columns = [field['name'] for field in fields if field['name'] != 'hue_id']
      fetch_handle = lambda rows, start_over: get_api(request, snippet).fetch_result(notebook, snippet, rows=rows, start_over=start_over) # Assumes handle still live
      rows = searcher.update_data_from_hive(index_name, columns, fetch_handle=fetch_handle, indexing_options=kwargs)
      # TODO if rows == MAX_ROWS truncation warning
    elif source['inputFormat'] == 'manual':
      pass # No need to do anything
    else:
      response = client.index(name=index_name, data=data, **kwargs)
      errors = [error.get('message', '') for error in response['responseHeader'].get('errors', [])]
  except Exception as e:
    try:
      client.delete_index(index_name, keep_config=False)
    except Exception as e2:
      LOG.warn('Error while cleaning-up config of failed collection creation %s: %s' % (index_name, e2))
    raise e  # re-raise after cleanup so the caller sees the failure
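
The two Solr-6+ kwargs set above map onto Solr request parameters (assumed
mapping, matching Solr's CSV handler and update-processor docs): processor
selects the tolerant update processor so one bad row does not abort the whole
batch, and map rewrites the literal string "NULL" to an empty value. A rough
sketch of the equivalent raw request (hypothetical host and collection; data
is the CSV payload read with fs.read above):

import requests

params = {
    'processor': 'tolerant',  # keep indexing past individual bad rows
    'map': 'NULL:',           # CSV handler rule: literal "NULL" becomes empty
    'commit': 'true',
}
requests.post('http://localhost:8983/solr/books_index/update/csv',
              params=params, data=data, headers={'Content-Type': 'text/csv'})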
Example #4
File: api3.py  Project: zhengwei2020/hue
def _create_solr_collection(user, fs, client, destination, index_name, kwargs):
    unique_key_field = destination['indexerPrimaryKey'] and destination[
        'indexerPrimaryKey'][0] or None
    df = destination['indexerDefaultField'] and destination[
        'indexerDefaultField'][0] or None

    indexer = MorphlineIndexer(user, fs)
    fields = indexer.get_field_list(destination['columns'])
    skip_fields = [field['name'] for field in fields if not field['keep']]

    kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
    for field in fields:
        for operation in field['operations']:
            if operation['type'] == 'split':
                # Solr requires multiValued to be set when splitting
                field['multiValued'] = True
                kwargs['f.%(name)s.split' % field] = 'true'
                kwargs['f.%(name)s.separator' % field] = operation['settings']['splitChar'] or ','

    if skip_fields:
        kwargs['skip'] = ','.join(skip_fields)
        fields = [
            field for field in fields if field['name'] not in skip_fields
        ]

    if not unique_key_field:
        unique_key_field = 'hue_id'
        fields += [{"name": unique_key_field, "type": "string"}]
        kwargs['rowid'] = unique_key_field

    if not destination['hasHeader']:
        kwargs['header'] = 'false'
    else:
        kwargs['skipLines'] = 1

    if not client.exists(index_name):
        client.create_index(
            name=index_name,
            config_name=destination.get('indexerConfigSet'),
            fields=fields,
            unique_key_field=unique_key_field,
            df=df,
            shards=destination['indexerNumShards'],
            replication=destination['indexerReplicationFactor'])
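
For illustration, a hypothetical field named tags with a split operation on
';' would leave these entries in kwargs, which Solr's CSV update handler
reads as per-field parsing rules:

kwargs['f.tags.split'] = 'true'   # parse the tags column as multiple values
kwargs['f.tags.separator'] = ';'  # split on ';' instead of the default ','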
Example #5
def _small_indexing(user, fs, client, source, destination, index_name):
    unique_key_field = destination['indexerPrimaryKey'] and destination[
        'indexerPrimaryKey'][0] or None
    df = destination['indexerDefaultField'] and destination[
        'indexerDefaultField'][0] or None
    kwargs = {}
    errors = []

    if source['inputFormat'] not in ('manual', 'table', 'query_handle'):
        stats = fs.stats(source["path"])
        if stats.size > MAX_UPLOAD_SIZE:
            raise PopupException(_('File size is too large to handle!'))

    indexer = MorphlineIndexer(user, fs)
    fields = indexer.get_field_list(destination['columns'])
    skip_fields = [field['name'] for field in fields if not field['keep']]

    kwargs['fieldnames'] = ','.join([field['name'] for field in fields])
    if skip_fields:
        kwargs['skip'] = ','.join(skip_fields)
        fields = [
            field for field in fields if field['name'] not in skip_fields
        ]

    if not unique_key_field:
        unique_key_field = 'hue_id'
        fields += [{"name": unique_key_field, "type": "string"}]
        kwargs['rowid'] = unique_key_field

    if not destination['hasHeader']:
        kwargs['header'] = 'false'
    else:
        kwargs['skipLines'] = 1

    if not client.exists(index_name):
        client.create_index(
            name=index_name,
            config_name=destination.get('indexerConfigSet'),
            fields=fields,
            unique_key_field=unique_key_field,
            df=df,
            shards=destination['indexerNumShards'],
            replication=destination['indexerReplicationFactor'])

    if source['inputFormat'] == 'file':
        data = fs.read(source["path"], 0, MAX_UPLOAD_SIZE)

    if client.is_solr_six_or_more():
        kwargs['processor'] = 'tolerant'

    try:
        if source['inputFormat'] == 'query':
            query_id = source['query']['id'] if source['query'].get(
                'id') else source['query']

            notebook = Notebook(document=Document2.objects.document(
                user=user, doc_id=query_id)).get_data()
            request = MockedDjangoRequest(user=user)
            snippet = notebook['snippets'][0]

            searcher = CollectionManagerController(user)
            columns = [
                field['name'] for field in fields if field['name'] != 'hue_id'
            ]
            fetch_handle = lambda rows, start_over: get_api(
                request, snippet).fetch_result(
                    notebook, snippet, rows=rows, start_over=start_over
                )  # Assumes handle still live
            rows = searcher.update_data_from_hive(index_name,
                                                  columns,
                                                  fetch_handle=fetch_handle,
                                                  indexing_options=kwargs)
            # TODO if rows == MAX_ROWS truncation warning
        else:
            response = client.index(name=index_name, data=data, **kwargs)
            errors = [
                error.get('message', '')
                for error in response['responseHeader'].get('errors', [])
            ]
    except Exception as e:
        try:
            client.delete_index(index_name, keep_config=False)
        except Exception as e2:
            LOG.warn(
                'Error while cleaning-up config of failed collection creation %s: %s'
                % (index_name, e2))
        raise e  # re-raise after cleanup so the caller sees the failure
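
A rough sketch (assumed control flow) of how update_data_from_hive can drive
the fetch_handle closure: keep paging through the still-open query handle and
index each batch until the results are exhausted. The 'has_more' key is an
assumption about the notebook API's result shape.

start_over = True
while True:
    result = fetch_handle(1000, start_over)  # hypothetical page size
    start_over = False
    # ...convert result rows into Solr documents and index the batch...
    if not result.get('has_more'):
        break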