def generate_morphline_config(self, collection_name, data, uuid_name=None):
    """Render the morphline configuration for indexing into a collection.

    `data` is the indexing spec: 'columns' holds the field definitions and
    'format' the file format settings (including its 'type'). Returns the
    rendered morphline content (a 'SOLR_LOCATOR : { ... }' style string).
    """
    libs_path = CONFIG_INDEXER_LIBS_PATH.get()
    geolite_db = os.path.join(libs_path, "GeoLite2-City.mmdb")
    grok_dicts = os.path.join(libs_path, "grok_dictionaries")

    # Optional Grok/GeoIP resources are only handed to the template when
    # they actually exist on the filesystem.
    template_args = {
        "collection_name": collection_name,
        "fields": self.get_field_list(data['columns']),
        "num_base_fields": len(data['columns']),
        "uuid_name": uuid_name,
        "get_regex": Indexer._get_regex_for_type,
        "format_settings": data['format'],
        "format_class": get_file_format_class(data['format']['type']),
        "get_kept_args": get_checked_args,
        "grok_dictionaries_location": grok_dicts if self.fs and self.fs.exists(grok_dicts) else None,
        "geolite_db_location": geolite_db if self.fs and self.fs.exists(geolite_db) else None,
        "zk_host": zkensemble(),
    }

    template_dir = CONFIG_INDEXING_TEMPLATES_PATH.get()
    lookup = TemplateLookup(directories=[template_dir])

    return lookup.get_template("morphline_template.conf").render(**template_args)
def generate_morphline_config(self, collection_name, data, uuid_name):
    """Render the morphline configuration for indexing into a collection.

    `data` carries the field definitions under 'columns' and the file
    format settings under 'format'. Returns the rendered morphline content
    (a 'SOLR_LOCATOR : { ... }' style string).
    """
    template_args = {
        "collection_name": collection_name,
        "fields": self.get_field_list(data['columns']),
        "num_base_fields": len(data['columns']),
        "format_character": Indexer._format_character,
        "uuid_name": uuid_name,
        "get_regex": Indexer._get_regex_for_type,
        "format": data['format'],
        "grok_dictionaries_location": os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), "grok_dictionaries"),
        "zk_host": zkensemble(),
    }

    template_dir = CONFIG_INDEXING_TEMPLATES_PATH.get()
    lookup = TemplateLookup(directories=[template_dir])

    return lookup.get_template("morphline_template.conf").render(**template_args)
def generate_morphline_config(self, collection_name, data, uuid_name=None, lib_path=None):
    """Render the morphline configuration for indexing into a collection.

    When `lib_path` is not supplied, the configured indexer libs location is
    used. Optional Grok/GeoIP resources are only passed to the template when
    they exist on the filesystem.
    """
    if lib_path is None:
        lib_path = CONFIG_INDEXER_LIBS_PATH.get()

    geolite_db = os.path.join(lib_path, "GeoLite2-City.mmdb")
    grok_dicts = os.path.join(lib_path, "grok_dictionaries")

    template_args = {
        "collection_name": collection_name,
        "fields": self.get_field_list(data['columns'], is_converting_types=True),
        "num_base_fields": len(data['columns']),
        "uuid_name": uuid_name,
        "get_regex": MorphlineIndexer._get_regex_for_type,
        "format_settings": data['format'],
        "format_class": get_file_format_class(data['format']['type']),
        "get_kept_args": get_checked_args,
        "grok_dictionaries_location": grok_dicts if self.fs and self.fs.exists(grok_dicts) else None,
        "geolite_db_location": geolite_db if self.fs and self.fs.exists(geolite_db) else None,
        # NOTE(review): original carried an "offline test?" question here —
        # confirm this works without a live ZooKeeper in offline test runs.
        "zk_host": self.solr_client.get_zookeeper_host(),
    }

    template_dir = CONFIG_INDEXING_TEMPLATES_PATH.get()
    lookup = TemplateLookup(directories=[template_dir])

    return lookup.get_template("morphline_template.conf").render(**template_args)
def run_morphline(self, request, collection_name, morphline, input_path, query=None):
    """Submit a MapReduceIndexerTool batch job running the given morphline.

    :param request: current request, forwarded to the notebook/editor APIs.
    :param collection_name: target Solr collection.
    :param morphline: rendered morphline content, uploaded to a workspace.
    :param input_path: HDFS path to index; overridden when `query` is given.
    :param query: optional saved query document UUID — its result is first
        exported to a temporary Hive table and that table's location is
        indexed instead of `input_path`.
    :return: the notebook execution handle.
    """
    workspace_path = self._upload_workspace(morphline)

    notebook = Notebook(name='Indexer job for %s' % collection_name, isManaged=True)

    if query:
        q = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=query))
        notebook_data = q.get_data()
        snippet = notebook_data['snippets'][0]

        api = get_api(request, snippet)

        destination = '__hue_%s' % notebook_data['uuid'][:4]
        location = '/user/%s/__hue-%s' % (request.user, notebook_data['uuid'][:4])
        # Fix: the export's success URL is never used — bind it as
        # `_success_url` (matching the newer run_morphline variant).
        sql, _success_url = api.export_data_as_table(notebook_data, snippet, destination, is_temporary=True, location=location)
        input_path = '${nameNode}%s' % location

        notebook.add_hive_snippet(snippet['database'], sql)

    notebook.add_java_snippet(
        clazz='org.apache.solr.hadoop.MapReduceIndexerTool',
        app_jar=CONFIG_INDEXER_LIBS_PATH.get(),
        arguments=[
            u'--morphline-file',
            u'morphline.conf',
            u'--output-dir',
            u'${nameNode}/user/%s/indexer' % self.username,
            u'--log4j',
            u'log4j.properties',
            u'--go-live',
            u'--zk-host',
            zkensemble(),
            u'--collection',
            collection_name,
            input_path,
        ],
        files=[
            {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'},
            {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'}
        ]
    )

    return notebook.execute(request, batch=True)
def run_morphline(self, request, collection_name, morphline, input_path, query=None, start_time=None, lib_path=None):
    """Submit a batch indexing task that runs `morphline` over `input_path`
    into the given Solr collection.

    When `query` is a saved document UUID, its result is first exported to a
    temporary Hive table and that table's HDFS location is indexed instead.
    Returns the task execution handle.
    """
    workspace_path = self._upload_workspace(morphline)

    task = make_notebook(
        name=_('Indexing into %s') % collection_name,
        editor_type='notebook',
        on_success_url=reverse('search:browse', kwargs={'name': collection_name}),
        pub_sub_url='assist.collections.refresh',
        is_task=True,
        is_notebook=True,
        last_executed=start_time
    )

    if query:
        source = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=query))
        source_data = source.get_data()
        source_snippet = source_data['snippets'][0]

        uuid_prefix = source_data['uuid'][:4]
        destination = '__hue_%s' % uuid_prefix
        location = '/user/%s/__hue-%s' % (request.user, uuid_prefix)

        api = get_api(request, source_snippet)
        sql, _success_url = api.export_data_as_table(source_data, source_snippet, destination, is_temporary=True, location=location)
        input_path = '${nameNode}%s' % location

        task.add_hive_snippet(source_snippet['database'], sql)

    client = SolrClient(self.user)

    # Solr 6+ needs the job classpath to win over the cluster's.
    arguments = []
    if client.is_solr_six_or_more():
        arguments.append('-Dmapreduce.job.user.classpath.first=true')
    arguments += [
        u'--morphline-file',
        u'morphline.conf',
        u'--output-dir',
        u'${nameNode}/user/%s/indexer' % self.username,
        u'--log4j',
        u'log4j.properties',
        u'--go-live',
        u'--zk-host',
        client.get_zookeeper_host(),
        u'--collection',
        collection_name,
        input_path,
    ]

    task.add_java_snippet(
        clazz='org.apache.solr.hadoop.MapReduceIndexerTool',
        app_jar=lib_path if lib_path is not None else CONFIG_INDEXER_LIBS_PATH.get(),
        arguments=arguments,
        files=[
            {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'},
            {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'}
        ]
    )

    return task.execute(request, batch=True)
def run_morphline(self, collection_name, morphline, input_path):
    """Wrap MapReduceIndexerTool in a managed Java notebook and submit it
    as an Oozie batch workflow. Returns the Oozie job id."""
    workspace_path = self._upload_workspace(morphline)

    java_properties = {
        u'files': [
            {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'},
            {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'},
        ],
        u'class': u'org.apache.solr.hadoop.MapReduceIndexerTool',
        u'app_jar': CONFIG_INDEXER_LIBS_PATH.get(),
        u'arguments': [
            u'--morphline-file',
            u'morphline.conf',
            u'--output-dir',
            u'${nameNode}/user/%s/indexer' % self.username,
            u'--log4j',
            u'log4j.properties',
            u'--go-live',
            u'--zk-host',
            zkensemble(),
            u'--collection',
            collection_name,
            u'${nameNode}%s' % input_path,
        ],
        u'archives': [],
    }

    notebook = make_notebook(name='Indexer', editor_type='java', snippet_properties=java_properties).get_data()
    notebook_doc, _created = _save_notebook(notebook, self.user)

    workflow_doc = WorkflowBuilder().create_workflow(
        document=notebook_doc,
        user=self.user,
        managed=True,
        name=_("Batch job for %s") % notebook_doc.name
    )
    workflow = Workflow(document=workflow_doc, user=self.user)

    return _submit_workflow(user=self.user, fs=self.fs, jt=self.jt, workflow=workflow, mapping=None)
def generate_morphline_config(self, collection_name, data, uuid_name=None):
    """Render the morphline configuration used to index `data` into Solr.

    :param collection_name: target Solr collection name.
    :param data: indexing spec with 'columns' (field definitions) and
        'format' (file format settings, including its 'type').
    :param uuid_name: optional value passed through to the template as
        `uuid_name`.
    :return: the rendered morphline content as a string.
    """
    geolite_loc = os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), "GeoLite2-City.mmdb")
    grok_dicts_loc = os.path.join(CONFIG_INDEXER_LIBS_PATH.get(), "grok_dictionaries")

    properties = {
        "collection_name": collection_name,
        "fields": self.get_field_list(data['columns']),
        "num_base_fields": len(data['columns']),
        "uuid_name" : uuid_name,
        "get_regex": Indexer._get_regex_for_type,
        "format_settings": data['format'],
        "format_class": get_file_format_class(data['format']['type']),
        "get_kept_args": get_checked_args,
        # Optional resources: only referenced when present on the filesystem.
        "grok_dictionaries_location" : grok_dicts_loc if self.fs and self.fs.exists(grok_dicts_loc) else None,
        "geolite_db_location" : geolite_loc if self.fs and self.fs.exists(geolite_loc) else None,
        "zk_host": zkensemble()
    }

    oozie_workspace = CONFIG_INDEXING_TEMPLATES_PATH.get()

    lookup = TemplateLookup(directories=[oozie_workspace])
    morphline = lookup.get_template("morphline_template.conf").render(**properties)

    return morphline
def run_morphline(self, request, collection_name, morphline, input_path, query=None):
    """Submit a MapReduceIndexerTool batch job running the given morphline.

    :param request: current request, forwarded to the notebook/editor APIs.
    :param collection_name: target Solr collection.
    :param morphline: rendered morphline content, uploaded to a workspace.
    :param input_path: HDFS path to index; overridden when `query` is given.
    :param query: optional saved query document UUID — its result is first
        exported to a temporary Hive table and that table's location is
        indexed instead of `input_path`.
    :return: the notebook execution handle.
    """
    workspace_path = self._upload_workspace(morphline)

    notebook = Notebook(
        name='Indexer job for %s' % collection_name,
        isManaged=True
    )

    if query:
        q = Notebook(document=Document2.objects.get_by_uuid(user=self.user, uuid=query))
        notebook_data = q.get_data()
        snippet = notebook_data['snippets'][0]

        api = get_api(request, snippet)

        destination = '__hue_%s' % notebook_data['uuid'][:4]
        location = '/user/%s/__hue-%s' % (request.user, notebook_data['uuid'][:4])
        # Fix: the export's success URL is never used — bind it as
        # `_success_url` (matching the newer run_morphline variant).
        sql, _success_url = api.export_data_as_table(notebook_data, snippet, destination, is_temporary=True, location=location)
        input_path = '${nameNode}%s' % location

        notebook.add_hive_snippet(snippet['database'], sql)

    notebook.add_java_snippet(
        clazz='org.apache.solr.hadoop.MapReduceIndexerTool',
        app_jar=CONFIG_INDEXER_LIBS_PATH.get(),
        arguments=[
            u'--morphline-file',
            u'morphline.conf',
            u'--output-dir',
            u'${nameNode}/user/%s/indexer' % self.username,
            u'--log4j',
            u'log4j.properties',
            u'--go-live',
            u'--zk-host',
            zkensemble(),
            u'--collection',
            collection_name,
            input_path,
        ],
        files=[
            {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'},
            {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'}
        ]
    )

    return notebook.execute(request, batch=True)
def _schedule_oozie_job(self, workspace_path, collection_name, input_path):
    """Submit the indexing workflow under `workspace_path` via Oozie.

    :param workspace_path: HDFS workspace holding the workflow definition.
    :param collection_name: target Solr collection, passed as `collectionName`.
    :param input_path: file to index, passed as `filePath`.
    :return: the submitted Oozie job id.
    """
    # Fix: the client returned by get_oozie() was bound to a local that was
    # never read. The call itself is kept in case construction has side
    # effects — TODO confirm and drop the call entirely if it has none.
    get_oozie(self.username)

    properties = {
        "dryrun": "False",
        "zkHost": zkensemble(),
        # These libs can be installed from here:
        # https://drive.google.com/a/cloudera.com/folderview?id=0B1gZoK8Ae1xXc0sxSkpENWJ3WUU&usp=sharing
        "oozie.libpath": CONFIG_INDEXER_LIBS_PATH.get(),
        "security_enabled": "False",
        "collectionName": collection_name,
        "filePath": input_path,
        "outputDir": "/user/%s/indexer" % self.username,
        "workspacePath": workspace_path,
        'oozie.wf.application.path': "${nameNode}%s" % workspace_path,
        'user.name': self.username
    }

    submission = Submission(self.username, fs=self.fs, properties=properties)
    job_id = submission.run(workspace_path)

    return job_id
def run_morphline(self, collection_name, morphline, input_path):
    """Build a managed Java notebook around MapReduceIndexerTool and submit
    it as an Oozie batch workflow. Returns the Oozie job id."""
    workspace_path = self._upload_workspace(morphline)

    props = {
        u'files': [
            {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'},
            {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'},
        ],
        u'class': u'org.apache.solr.hadoop.MapReduceIndexerTool',
        u'app_jar': CONFIG_INDEXER_LIBS_PATH.get(),
        u'arguments': [
            u'--morphline-file',
            u'morphline.conf',
            u'--output-dir',
            u'${nameNode}/user/%s/indexer' % self.username,
            u'--log4j',
            u'log4j.properties',
            u'--go-live',
            u'--zk-host',
            zkensemble(),
            u'--collection',
            collection_name,
            u'${nameNode}%s' % input_path,
        ],
        u'archives': [],
    }

    indexer_notebook = make_notebook(name='Indexer', editor_type='java', snippet_properties=props)
    notebook = indexer_notebook.get_data()
    notebook_doc, _created = _save_notebook(notebook, self.user)

    workflow_doc = WorkflowBuilder().create_workflow(
        document=notebook_doc,
        user=self.user,
        managed=True,
        name=_("Batch job for %s") % notebook_doc.name
    )
    workflow = Workflow(document=workflow_doc, user=self.user)

    return _submit_workflow(user=self.user, fs=self.fs, jt=self.jt, workflow=workflow, mapping=None)
def run_morphline(self, request, collection_name, morphline, input_path):
    """Run the morphline via a managed Java notebook snippet.

    Uploads the morphline workspace, builds a 'running' Java notebook around
    MapReduceIndexerTool, and executes its first snippet.

    :param request: current request, forwarded to the notebook execution API.
    :param collection_name: target Solr collection.
    :param morphline: rendered morphline content, uploaded to a workspace.
    :param input_path: path to index (passed verbatim as the tool's input).
    :return: the job handle returned by `_execute_notebook`.
    """
    workspace_path = self._upload_workspace(morphline)

    # Fix: removed a large block of commented-out dead code (an earlier
    # make_notebook2-based submission path) that only obscured the live logic.
    snippet_properties = {
        u'files': [
            {u'path': u'%s/log4j.properties' % workspace_path, u'type': u'file'},
            {u'path': u'%s/morphline.conf' % workspace_path, u'type': u'file'}
        ],
        u'class': u'org.apache.solr.hadoop.MapReduceIndexerTool',
        u'app_jar': CONFIG_INDEXER_LIBS_PATH.get(),
        u'arguments': [
            u'--morphline-file',
            u'morphline.conf',
            u'--output-dir',
            u'${nameNode}/user/%s/indexer' % self.username,
            u'--log4j',
            u'log4j.properties',
            u'--go-live',
            u'--zk-host',
            zkensemble(),
            u'--collection',
            collection_name,
            input_path,
        ],
        u'archives': [],
    }

    notebook = make_notebook(name='Indexer', editor_type='java', snippet_properties=snippet_properties, status='running').get_data()
    notebook_doc, created = _save_notebook(notebook, self.user)

    snippet = {'wasBatchExecuted': True, 'id': notebook['snippets'][0]['id'], 'statement': ''}
    job_handle = _execute_notebook(request, notebook, snippet)

    return job_handle