def update_data_from_hive(self, db, collection_or_core_name, database, table, columns, indexing_strategy='upload'):
    """Index the selected columns of a Hive table into a Solr collection/core.

    Runs a browse-limited SELECT through beeswax, converts the result set to
    CSV with tablib and posts it to Solr.

    Args:
        db: beeswax dbms client used to run the query and fetch results.
        collection_or_core_name: target Solr collection or core.
        database: Hive database name.
        table: Hive table name (a string; resolved to a table object below).
        columns: list of column names to select and index.
        indexing_strategy: only 'upload' is implemented here.

    Raises:
        PopupException: when the query yields no handle, Solr rejects the
            update, or any unexpected error occurs.
    """
    # Run a custom hive query and post data to collection.
    from beeswax.server import dbms
    import tablib

    api = SolrApi(SOLR_URL.get(), self.user, SECURITY_ENABLED.get())

    if indexing_strategy == 'upload':
        # Use a distinct name: the original rebound the `table` parameter to
        # the table object, shadowing the string argument.
        hive_table = db.get_table(database, table)
        # NOTE(review): `database.table` is wrapped in a single pair of
        # backticks; Hive normally quotes each identifier separately
        # (`db`.`table`) — confirm this form works on the deployed Hive.
        hql = "SELECT %s FROM `%s.%s` %s" % (','.join(columns), database, hive_table.name, db._get_browse_limit_clause(hive_table))
        query = dbms.hql_query(hql)

        try:
            handle = db.execute_and_wait(query)

            if handle:
                result = db.fetch(handle, rows=100)
                db.close(handle)

                dataset = tablib.Dataset()
                dataset.append(columns)  # header row
                for row in result.rows():
                    dataset.append(row)

                if not api.update(collection_or_core_name, dataset.csv, content_type='csv'):
                    raise PopupException(_('Could not update index. Check error logs for more info.'))
            else:
                raise PopupException(_('Could not update index. Could not fetch any data from Hive.'))
        except PopupException:
            # Don't re-wrap our own specific errors with the generic message.
            raise
        except Exception as e:  # py3-compatible syntax (was `except Exception, e`)
            raise PopupException(_('Could not update index.'), detail=e)
def update_data_from_hdfs(self, fs, collection_or_core_name, fields, path, data_type='separated', indexing_strategy='upload', **kwargs):
    """Add hdfs path contents to index.

    Reads a file from HDFS, converts it to JSON (either log-format or a
    separated-values file) and posts it to Solr.

    Args:
        fs: HDFS filesystem client (provides stats() and open()).
        collection_or_core_name: target Solr collection or core.
        fields: ignored here; the schema is re-read via self.get_fields().
        path: HDFS path of the file to index.
        data_type: 'log' or 'separated'.
        indexing_strategy: only 'upload' is implemented.
        **kwargs: 'separator' and 'quote_character' for separated files.

    Raises:
        PopupException: file too large, unknown data_type, unsupported
            strategy, or Solr rejecting the update.
    """
    api = SolrApi(SOLR_URL.get(), self.user, SECURITY_ENABLED.get())

    if indexing_strategy == 'upload':
        stats = fs.stats(path)
        if stats.size > MAX_UPLOAD_SIZE:
            raise PopupException(_('File size is too large to handle!'))
        else:
            # Get fields for filtering
            unique_key, fields = self.get_fields(collection_or_core_name)
            fields = [{'name': field, 'type': fields[field]['type']} for field in fields]

            fh = fs.open(path)
            # try/finally so the HDFS handle is closed even when parsing
            # fails or the unknown-type branch raises (the original leaked
            # `fh` on those paths).
            try:
                if data_type == 'log':
                    # Transform to JSON then update
                    data = json.dumps([value for value in field_values_from_log(fh, fields)])
                    content_type = 'json'
                elif data_type == 'separated':
                    data = json.dumps([value for value in field_values_from_separated_file(fh, kwargs.get('separator', ','), kwargs.get('quote_character', '"'), fields)], indent=2)
                    content_type = 'json'
                else:
                    raise PopupException(_('Could not update index. Unknown type %s') % data_type)
            finally:
                fh.close()

            if not api.update(collection_or_core_name, data, content_type=content_type):
                raise PopupException(_('Could not update index. Check error logs for more info.'))
    else:
        raise PopupException(_('Could not update index. Indexing strategy %s not supported.') % indexing_strategy)
def update_data_from_hive(self, db, collection_or_core_name, database, table, columns, indexing_strategy='upload'):
    """Index the selected columns of a Hive table into a Solr collection/core.

    Runs a browse-limited SELECT through beeswax, converts the result set to
    CSV with tablib and posts it to Solr.

    Args:
        db: beeswax dbms client used to run the query and fetch results.
        collection_or_core_name: target Solr collection or core.
        database: Hive database name.
        table: Hive table name (a string; resolved to a table object below).
        columns: list of column names to select and index.
        indexing_strategy: only 'upload' is implemented here.

    Raises:
        PopupException: when no handle is returned, Solr rejects the update,
            or the indexing strategy is unsupported.
    """
    # Run a custom hive query and post data to collection.
    from beeswax.server import dbms
    import tablib

    api = SolrApi(SOLR_URL.get(), self.user, SECURITY_ENABLED.get())

    if indexing_strategy == 'upload':
        # Use a distinct name: the original rebound the `table` parameter to
        # the table object, shadowing the string argument.
        hive_table = db.get_table(database, table)
        # NOTE(review): `database.table` is wrapped in a single pair of
        # backticks; Hive normally quotes each identifier separately
        # (`db`.`table`) — confirm this form works on the deployed Hive.
        hql = "SELECT %s FROM `%s.%s` %s" % (','.join(columns), database, hive_table.name, db._get_browse_limit_clause(hive_table))
        query = dbms.hql_query(hql)

        handle = db.execute_and_wait(query)

        if handle:
            result = db.fetch(handle, rows=100)
            db.close(handle)

            dataset = tablib.Dataset()
            dataset.append(columns)  # header row
            for row in result.rows():
                dataset.append(row)

            if not api.update(collection_or_core_name, dataset.csv, content_type='csv'):
                raise PopupException(_('Could not update index. Check error logs for more info.'))
        else:
            raise PopupException(_('Could not update index. Could not fetch any data from Hive.'))
    else:
        raise PopupException(_('Could not update index. Indexing strategy %s not supported.') % indexing_strategy)
def update_data_from_hive(self, collection_or_core_name, columns, fetch_handle):
    """Incrementally index rows pulled from a Hive fetch callback into Solr.

    Fetches rows in batches via `fetch_handle`, prefixes each row with a
    running row id, normalizes empty cells, and posts each batch to Solr as
    CSV — up to MAX_ROWS rows in total.

    Args:
        collection_or_core_name: target Solr collection or core.
        columns: list of column names, used as the CSV header.
        fetch_handle: callable(batch_size, is_first_fetch) returning a dict
            with 'has_more' (bool) and 'data' (list of rows).

    Raises:
        PopupException: on Solr rejection or any unexpected error.
    """
    MAX_ROWS = 10000    # hard cap on total rows indexed
    FETCH_BATCH = 1000  # rows requested per fetch_handle call

    row_count = 0
    has_more = True
    api = SolrApi(SOLR_URL.get(), self.user, SECURITY_ENABLED.get())

    try:
        while row_count < MAX_ROWS and has_more:
            result = fetch_handle(FETCH_BATCH, row_count == 0)
            has_more = result['has_more']
            batch = result['data']

            if not batch:
                # Guard: a backend reporting has_more with no rows would
                # otherwise make this loop spin forever.
                break

            dataset = tablib.Dataset()
            dataset.append(columns)  # header row
            for i, row in enumerate(batch):
                # Prefix a running row id; replace falsy cells with 0 for
                # numbers and '' otherwise so the CSV stays well-formed.
                dataset.append([row_count + i] + [
                    cell if cell else (0 if isinstance(cell, numbers.Number) else '')
                    for cell in row
                ])

            if not api.update(collection_or_core_name, dataset.csv, content_type='csv'):
                raise PopupException(_('Could not update index. Check error logs for more info.'))

            # Count only fetched data rows: len(dataset) also counted the
            # header row appended above, inflating the total by 1 per batch.
            row_count += len(batch)
    except PopupException:
        # Don't re-wrap our own specific error with the generic message.
        raise
    except Exception as e:  # py3-compatible syntax (was `except Exception, e`)
        raise PopupException(_('Could not update index.'), detail=e)