Python FlumeIndexer примеры использования

Язык программирования: Python

Пространство имен/Пакет: indexer.indexers.flume

Класс/Тип: FlumeIndexer

Примеров на hotexamples.com: 2

Python FlumeIndexer — найдено 2 примера. Это лучшие примеры кода на Python для indexer.indexers.flume.FlumeIndexer, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

FlumeIndexer (2)

Основные методы

FlumeIndexer (2)

Пример #1

Показать файл

def _large_indexing(request, file_format, collection_name, query=None, start_time=None, lib_path=None, destination=None):
  """Index the input described by ``file_format`` into ``collection_name``.

  The Solr collection is created first when it does not exist yet and the
  request did not post ``show_command``. The rest depends on
  ``file_format['inputFormat']``:

  * ``'stream'`` with ``streamSelection == 'flume'``: handled by
    ``FlumeIndexer``. Returns ``{'status': 0, 'commands': <last config>}``
    when ``show_command`` is posted, otherwise the result of ``start``.
  * any other ``'stream'``: returns the result of ``_envelope_job``.
  * ``'table'``, ``'file'`` or anything else: a morphline config is generated
    and run; the return value is whatever ``run_morphline`` returns.
  """
  morphline_indexer = MorphlineIndexer(request.user, request.fs)

  unique_field = morphline_indexer.get_unique_field(file_format)
  is_unique_generated = morphline_indexer.is_unique_generated(file_format)

  schema_fields = morphline_indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    # The generated key is not among the kept columns, so add it to the schema.
    schema_fields += [{"name": unique_field, "type": "string"}]

  solr = SolrClient(user=request.user)

  # POST is only consulted when the collection is missing (short-circuit).
  collection_missing = not solr.exists(collection_name)
  if collection_missing and not request.POST.get('show_command'): # if destination['isTargetExisting']:
    solr.create_index(
      name=collection_name,
      fields=request.POST.get('fields', schema_fields),
      unique_key_field=unique_field
      # No df currently
    )
  # TODO: when the collection already exists, check if format matches

  input_format = file_format['inputFormat']

  # Stream inputs never reach the morphline step: both variants return here.
  if input_format == 'stream':
    if file_format['streamSelection'] != 'flume':
      return _envelope_job(request, file_format, destination, start_time=start_time, lib_path=lib_path)

    flume_indexer = FlumeIndexer(user=request.user)
    if not request.POST.get('show_command'):
      return flume_indexer.start(collection_name, file_format, destination)

    configs = flume_indexer.generate_config(file_format, destination)
    return {'status': 0, 'commands': configs[-1]}

  # Resolve where the morphline job reads its data from.
  if input_format == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif input_format == 'file':
    input_path = '${nameNode}%s' % urllib_unquote(file_format["path"])
  else:
    input_path = None

  morphline = morphline_indexer.generate_morphline_config(
      collection_name,
      file_format,
      unique_field,
      lib_path=lib_path
  )

  return morphline_indexer.run_morphline(
      request,
      collection_name,
      morphline,
      input_path,
      query,
      start_time=start_time,
      lib_path=lib_path
  )

Пример #2

Показать файл

Файл: flume_tests.py Проект: ziq211/hue

def test_generate_from_directory_to_solr_index():
    """Compare the configs generated for a directory source and a Solr index.

    The test is skipped unconditionally: the ``raise SkipTest`` on the first
    line makes everything after it unreachable.
    """
    raise SkipTest

    source = {'channelSourceType': 'directory'}
    # Key spelling ('ouputFormat') is kept exactly as written in this test.
    destination = {'ouputFormat': 'index'}

    indexer = FlumeIndexer(user=None)
    configs = indexer.generate_config(source=source, destination=destination)

    # Expected morphline configuration, compared after stripping outer whitespace.
    expected_morphline = '''SOLR_LOCATOR : {
    # Name of solr collection
    collection : log_analytics_demo
    # ZooKeeper ensemble
    zkHost : "spark2-envelope515-1.gce.cloudera.com:2181/solr"
}


morphlines : [
{
    id : hue_accesslogs_no_geo

    importCommands : ["org.kitesdk.**", "org.apache.solr.**"]
    commands : [
    {
        ## Read the email stream and break it up into individual messages.
        ## The beginning of a message is marked by regex clause below
        ## The reason we use this command is that one event can have multiple
        ## messages
        readCSV {

        ## Hue HTTPD load balancer
        ## 172.18.18.3 - - [27/Aug/2018:05:47:12 -0700] "GET /static/desktop/js/jquery.rowselector.a04240f7cc48.js HTTP/1.1" 200 2321

      separator:  " "
            columns:  [client_ip,C1,C2,time,dummy1,request,code,bytes]
      ignoreFirstLine : false
            quoteChar : "\""
            commentPrefix : ""
            trim : true
            charset : UTF-8
        }
    }
    {
  split {
    inputField : request
    outputFields : [method, url, protocol]
    separator : " "
    isRegex : false
    #separator : """\s*,\s*"""
    #  #isRegex : true
    addEmptyStrings : false
    trim : true
          }
    }
     {
  split {
    inputField : url
    outputFields : ["", app, subapp]
    separator : "\/"
    isRegex : false
    #separator : """\s*,\s*"""
    #  #isRegex : true
    addEmptyStrings : false
    trim : true
          }
    }
    {
  userAgent {
    inputField : user_agent
    outputFields : {
      user_agent_family : "@{ua_family}"
      user_agent_major  : "@{ua_major}"
      device_family     : "@{device_family}"
      os_family         : "@{os_family}"
      os_major    : "@{os_major}"
    }
  }
    }

      #{logInfo { format : "BODY : {}", args : ["@{}"] } }
    # add Unique ID, in case our message_id field from above is not present
    {
        generateUUID {
            field:id
        }
    }

    # convert the timestamp field to "yyyy-MM-dd'T'HH:mm:ss.SSSZ" format
    {
       #  21/Nov/2014:22:08:27
        convertTimestamp {
            field : time
            inputFormats : ["[dd/MMM/yyyy:HH:mm:ss", "EEE, d MMM yyyy HH:mm:ss Z", "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'", "yyyy-MM-dd'T'HH:mm:ss", "yyyy-MM-dd"]
            #inputTimezone : America/Los_Angeles
            inputTimezone : UTC
            outputFormat : "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"
            outputTimezone : UTC
        }
    }

    # Consume the output record of the previous command and pipe another
    # record downstream.
    #
    # This command sanitizes record fields that are unknown to Solr schema.xml
    # by deleting them. Recall that Solr throws an exception on any attempt to
    # load a document that contains a field that isn't specified in schema.xml
    {
        sanitizeUnknownSolrFields {
            # Location from which to fetch Solr schema
            solrLocator : ${SOLR_LOCATOR}
        }
    }

    # load the record into a SolrServer or MapReduce SolrOutputFormat.
    {
        loadSolr {
            solrLocator : ${SOLR_LOCATOR}
        }
    }
    ]
}
]
'''.strip()

    # Expected (name, content) pair for the Flume agent configuration.
    expected_agent_config = (
        'agent_config_file',
        'tier1.sources = source1\n  tier1.channels = channel1\n  tier1.sinks = sink1\n\n\n  tier1.channels.channel1.type = memory\n  tier1.channels.channel1.capacity = 10000\n  tier1.channels.channel1.transactionCapacity = 1000\n\n  \n  tier1.sinks.sink1.type          = org.apache.flume.sink.solr.morphline.MorphlineSolrSink\n  tier1.sinks.sink1.morphlineFile = morphlines.conf\n  tier1.sinks.sink1.morphlineId = hue_accesslogs_no_geo\n  tier1.sinks.sink1.channel       = channel1'
    )

    assert_equal(
        expected_morphline,
        configs[0][1].strip()  # 'agent_morphlines_conf_file'
    )

    # NOTE(review): configs is indexed by position above but by string key
    # here; one of the two forms presumably does not match what
    # generate_config returns - confirm before re-enabling this test.
    assert_equal(expected_agent_config, configs['agent_config_file'])