Пример #1
0
def _large_indexing(request, file_format, collection_name, query=None, start_time=None, lib_path=None, destination=None):
  indexer = MorphlineIndexer(request.user, request.fs)

  unique_field = indexer.get_unique_field(file_format)
  is_unique_generated = indexer.is_unique_generated(file_format)

  schema_fields = indexer.get_kept_field_list(file_format['columns'])
  if is_unique_generated:
    schema_fields += [{"name": unique_field, "type": "string"}]

  client = SolrClient(user=request.user)

  if not client.exists(collection_name) and not request.POST.get('show_command'): # if destination['isTargetExisting']:
    client.create_index(
      name=collection_name,
      fields=request.POST.get('fields', schema_fields),
      unique_key_field=unique_field
      # No df currently
    )
  else:
    # TODO: check if format matches
    pass

  if file_format['inputFormat'] == 'table':
    db = dbms.get(request.user)
    table_metadata = db.get_table(database=file_format['databaseName'], table_name=file_format['tableName'])
    input_path = table_metadata.path_location
  elif file_format['inputFormat'] == 'stream' and file_format['streamSelection'] == 'flume':
    indexer = FlumeIndexer(user=request.user)
    if request.POST.get('show_command'):
      configs = indexer.generate_config(file_format, destination)
      return {'status': 0, 'commands': configs[-1]}
    else:
      return indexer.start(collection_name, file_format, destination)
  elif file_format['inputFormat'] == 'stream':
    return _envelope_job(request, file_format, destination, start_time=start_time, lib_path=lib_path)
  elif file_format['inputFormat'] == 'file':
    input_path = '${nameNode}%s' % urllib_unquote(file_format["path"])
  else:
    input_path = None

  morphline = indexer.generate_morphline_config(collection_name, file_format, unique_field, lib_path=lib_path)

  return indexer.run_morphline(
      request,
      collection_name,
      morphline,
      input_path,
      query,
      start_time=start_time,
      lib_path=lib_path
  )
Пример #2
0
def test_generate_from_directory_to_solr_index():
    raise SkipTest

    source = {
        'channelSourceType': 'directory',
    }
    destination = {
        'ouputFormat': 'index',
    }

    configs = FlumeIndexer(user=None).generate_config(source=source,
                                                      destination=destination)

    assert_equal(
        '''SOLR_LOCATOR : {
    # Name of solr collection
    collection : log_analytics_demo
    # ZooKeeper ensemble
    zkHost : "spark2-envelope515-1.gce.cloudera.com:2181/solr"
}


morphlines : [
{
    id : hue_accesslogs_no_geo

    importCommands : ["org.kitesdk.**", "org.apache.solr.**"]
    commands : [
    {
        ## Read the email stream and break it up into individual messages.
        ## The beginning of a message is marked by regex clause below
        ## The reason we use this command is that one event can have multiple
        ## messages
        readCSV {

        ## Hue HTTPD load balancer
        ## 172.18.18.3 - - [27/Aug/2018:05:47:12 -0700] "GET /static/desktop/js/jquery.rowselector.a04240f7cc48.js HTTP/1.1" 200 2321

      separator:  " "
            columns:  [client_ip,C1,C2,time,dummy1,request,code,bytes]
      ignoreFirstLine : false
            quoteChar : "\""
            commentPrefix : ""
            trim : true
            charset : UTF-8
        }
    }
    {
  split {
    inputField : request
    outputFields : [method, url, protocol]
    separator : " "
    isRegex : false
    #separator : """\s*,\s*"""
    #  #isRegex : true
    addEmptyStrings : false
    trim : true
          }
    }
     {
  split {
    inputField : url
    outputFields : ["", app, subapp]
    separator : "\/"
    isRegex : false
    #separator : """\s*,\s*"""
    #  #isRegex : true
    addEmptyStrings : false
    trim : true
          }
    }
    {
  userAgent {
    inputField : user_agent
    outputFields : {
      user_agent_family : "@{ua_family}"
      user_agent_major  : "@{ua_major}"
      device_family     : "@{device_family}"
      os_family         : "@{os_family}"
      os_major    : "@{os_major}"
    }
  }
    }

      #{logInfo { format : "BODY : {}", args : ["@{}"] } }
    # add Unique ID, in case our message_id field from above is not present
    {
        generateUUID {
            field:id
        }
    }

    # convert the timestamp field to "yyyy-MM-dd'T'HH:mm:ss.SSSZ" format
    {
       #  21/Nov/2014:22:08:27
        convertTimestamp {
            field : time
            inputFormats : ["[dd/MMM/yyyy:HH:mm:ss", "EEE, d MMM yyyy HH:mm:ss Z", "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'", "yyyy-MM-dd'T'HH:mm:ss", "yyyy-MM-dd"]
            #inputTimezone : America/Los_Angeles
            inputTimezone : UTC
            outputFormat : "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"
            outputTimezone : UTC
        }
    }

    # Consume the output record of the previous command and pipe another
    # record downstream.
    #
    # This command sanitizes record fields that are unknown to Solr schema.xml
    # by deleting them. Recall that Solr throws an exception on any attempt to
    # load a document that contains a field that isn't specified in schema.xml
    {
        sanitizeUnknownSolrFields {
            # Location from which to fetch Solr schema
            solrLocator : ${SOLR_LOCATOR}
        }
    }

    # load the record into a SolrServer or MapReduce SolrOutputFormat.
    {
        loadSolr {
            solrLocator : ${SOLR_LOCATOR}
        }
    }
    ]
}
]
'''.strip(),
        configs[0][1].strip()  # 'agent_morphlines_conf_file'
    )

    assert_equal((
        'agent_config_file',
        'tier1.sources = source1\n  tier1.channels = channel1\n  tier1.sinks = sink1\n\n\n  tier1.channels.channel1.type = memory\n  tier1.channels.channel1.capacity = 10000\n  tier1.channels.channel1.transactionCapacity = 1000\n\n  \n  tier1.sinks.sink1.type          = org.apache.flume.sink.solr.morphline.MorphlineSolrSink\n  tier1.sinks.sink1.morphlineFile = morphlines.conf\n  tier1.sinks.sink1.morphlineId = hue_accesslogs_no_geo\n  tier1.sinks.sink1.channel       = channel1'
    ), configs['agent_config_file'])