def test_get_ensemble():
    """Separated-file parsing copes with non-ASCII text and undecodable binary bytes."""
    # Accented latin characters survive the round trip.
    stream = string_io('fieldA\nrel=""nofollow"">Twitter for Péché')
    rows = list(field_values_from_separated_file(stream, delimiter='\t', quote_character='"'))
    assert_equal(u'rel=""nofollow"">Twitter for Péché', rows[0]['fieldA'])

    # Non-latin symbol (registered trademark).
    stream = string_io('fieldA\nrel=""nofollow"">Twitter for BlackBerry®')
    rows = list(field_values_from_separated_file(stream, delimiter='\t', quote_character='"'))
    assert_equal(u'rel=""nofollow"">Twitter for BlackBerry®', rows[0]['fieldA'])

    # Bad binary: the undecodable \x80 byte is discarded, control chars are kept.
    raw = b'fieldA\naaa\x80\x02\x03'
    if sys.version_info[0] > 2:
        # Python 3 needs the bytes decoded (ignoring the bad byte) before parsing.
        stream = string_io(force_unicode(raw, errors='ignore'))
    else:
        stream = string_io(raw)
    rows = list(field_values_from_separated_file(stream, delimiter='\t', quote_character='"'))
    assert_equal(u'aaa\x02\x03', rows[0]['fieldA'])
def test_get_ensemble():
    """Separated-file parsing handles non-ASCII text and bad binary bytes (Python 2)."""
    # Accented latin characters.
    stream = StringIO.StringIO('fieldA\nrel=""nofollow"">Twitter for Péché')
    rows = list(field_values_from_separated_file(stream, delimiter='\t', quote_character='"'))
    assert_equal(u'rel=""nofollow"">Twitter for Péché', rows[0]['fieldA'])

    # Non-latin symbol (registered trademark).
    stream = StringIO.StringIO('fieldA\nrel=""nofollow"">Twitter for BlackBerry®')
    rows = list(field_values_from_separated_file(stream, delimiter='\t', quote_character='"'))
    assert_equal(u'rel=""nofollow"">Twitter for BlackBerry®', rows[0]['fieldA'])

    # Bad binary: the undecodable \x80 byte is dropped, control chars are kept.
    stream = StringIO.StringIO('fieldA\naaa\x80\x02\x03')
    rows = list(field_values_from_separated_file(stream, delimiter='\t', quote_character='"'))
    assert_equal(u'aaa\x02\x03', rows[0]['fieldA'])
def test_get_ensemble():
    """Check that field extraction from separated files tolerates non-ASCII and binary input."""
    # Non-ASCII: accented characters.
    buf = StringIO.StringIO('fieldA\nrel=""nofollow"">Twitter for Péché')
    records = list(field_values_from_separated_file(buf, delimiter='\t', quote_character='"'))
    assert_equal(u'rel=""nofollow"">Twitter for Péché', records[0]['fieldA'])

    # Non-ASCII: trademark symbol.
    buf = StringIO.StringIO('fieldA\nrel=""nofollow"">Twitter for BlackBerry®')
    records = list(field_values_from_separated_file(buf, delimiter='\t', quote_character='"'))
    assert_equal(u'rel=""nofollow"">Twitter for BlackBerry®', records[0]['fieldA'])

    # Bad binary: \x80 cannot be decoded and is expected to be stripped.
    buf = StringIO.StringIO('fieldA\naaa\x80\x02\x03')
    records = list(field_values_from_separated_file(buf, delimiter='\t', quote_character='"'))
    assert_equal(u'aaa\x02\x03', records[0]['fieldA'])
def update_data_from_hdfs(self, fs, collection_or_core_name, fields, path, data_type='separated', indexing_strategy='upload', **kwargs):
    """
    Add the contents of an HDFS file to a Solr index.

    Reads the file at ``path`` through ``fs``, converts it to JSON documents
    (filtered to the collection's known fields) and posts it to Solr.

    Only the 'upload' indexing strategy is supported; the file must not
    exceed MAX_UPLOAD_SIZE.  ``kwargs`` may carry ``separator`` and
    ``quote_character`` for the 'separated' data type.

    Raises PopupException on unsupported strategy/type, oversized file, or a
    failed Solr update.
    """
    api = SolrApi(SOLR_URL.get(), self.user, SECURITY_ENABLED.get())

    if indexing_strategy != 'upload':
        raise PopupException(_('Could not update index. Indexing strategy %s not supported.') % indexing_strategy)

    if fs.stats(path).size > MAX_UPLOAD_SIZE:
        raise PopupException(_('File size is too large to handle!'))

    # Get fields for filtering
    unique_key, fields = self.get_fields(collection_or_core_name)
    fields = [{'name': name, 'type': fields[name]['type']} for name in fields]

    fh = fs.open(path)
    try:
        if data_type == 'log':
            # Transform to JSON then update
            data = json.dumps([value for value in field_values_from_log(fh, fields)])
            content_type = 'json'
        elif data_type == 'separated':
            data = json.dumps([value for value in field_values_from_separated_file(fh, kwargs.get('separator', ','), kwargs.get('quote_character', '"'), fields)], indent=2)
            content_type = 'json'
        else:
            raise PopupException(_('Could not update index. Unknown type %s') % data_type)
    finally:
        # Close the handle even on the unknown-type error path (it leaked before).
        fh.close()

    if not api.update(collection_or_core_name, data, content_type=content_type):
        raise PopupException(_('Could not update index. Check error logs for more info.'))
def _parse_fields(self, path, separator=',', quote_character='"', fieldtypes=None):
    """
    Infer a Solr field list from a separated-values file on local disk.

    The first row supplies the field names; types are guessed by sampling up
    to 51 rows with ``utils.get_field_types``.  ``fieldtypes`` maps a field
    name to an explicit type that overrides the guessed one (truthy values
    only, matching the previous ``and/or`` behavior).

    Returns a list of ``{'name': ..., 'type': ...}`` dicts.
    """
    # Avoid the mutable-default-argument pitfall: default was `fieldtypes={}`.
    overrides = fieldtypes or {}

    with open(path) as fh:
        field_generator = utils.field_values_from_separated_file(fh, separator, quote_character)
        first_row = next(field_generator)
        field_names = first_row.keys()
        # Re-chain the consumed first row so type detection sees every row.
        field_types = utils.get_field_types((row.values() for row in itertools.chain([first_row], field_generator)), iterations=51)
        # `overrides.get(name) or guessed` keeps the exact semantics of the old
        # `name in fieldtypes and fieldtypes[name] or guessed` expression.
        return [{'name': name, 'type': overrides.get(name) or guessed} for name, guessed in zip(field_names, field_types)]
def parse_fields(request):
    """
    Detect field names and types for an indexing source described by a POST request.

    Supported ``type`` values for the 'file' source:
      - 'separated': sample the file to guess column names/types
      - 'morphlines': extract %{TYPE:name} captures from a morphlines expression
      - 'log': delegate to fields_from_log

    Builds a ``result`` dict with ``status`` (0 ok, 1 handled failure,
    -1 error), optional ``data`` field pairs and ``message``.
    Raises PopupException on non-POST requests.
    """
    if request.method != 'POST':
        raise PopupException(_('POST request required.'))

    result = {'status': -1}

    source_type = request.POST.get('source')
    if source_type == 'file':
        content_type = request.POST.get('type')
        try:
            if content_type == 'separated':
                delimiter = request.POST.get('separator', ',')
                quote = request.POST.get('quote', '"')
                file_obj = request.fs.open(request.POST.get('path'))
                field_list = field_values_from_separated_file(file_obj, delimiter, quote)
                row = next(field_list)
                field_names = row.keys()
                # Chain the consumed first row back so sampling sees all rows.
                field_types = get_field_types((r.values() for r in itertools.chain([row], field_list)), iterations=51)
                file_obj.close()
                # list() so the pairs survive JSON serialization on Python 3,
                # where zip() is a lazy iterator.
                result['data'] = list(zip(field_names, field_types))
                result['status'] = 0
            elif content_type == 'morphlines':
                morphlines = json.loads(request.POST.get('morphlines'))
                # Look for entries that take on the form %{SYSLOGTIMESTAMP:timestamp}
                field_results = re.findall(r'\%\{(?P<type>\w+)\:(?P<name>\w+)\}', morphlines['expression'])
                if field_results:
                    result['data'] = []
                    for field_result in field_results:
                        result['data'].append((field_result[1], get_type_from_morphline_type(field_result[0])))
                    result['status'] = 0
                else:
                    result['status'] = 1
                    result['message'] = _('Could not detect any fields.')
            elif content_type == 'log':
                file_obj = request.fs.open(request.POST.get('path'))
                result['data'] = fields_from_log(file_obj)
                file_obj.close()
                result['status'] = 0
            else:
                result['status'] = 1
                result['message'] = _('Type %s not supported.') % content_type
        except Exception as e:
            # `except X, e` and `e.message` are Python-2-only spellings;
            # use the cross-version equivalents.
            LOG.exception(e)
            result['message'] = str(e)
def parse_fields(request):
    """
    Detect field names and types for an indexing source described by a POST request.

    Handles the 'file' source with content types 'separated' (sample the file
    to guess columns), 'morphlines' (extract %{TYPE:name} captures) and 'log'
    (delegate to fields_from_log).  Populates ``result`` with ``status``
    (0 ok, 1 handled failure, -1 error), ``data`` and ``message``.
    Raises PopupException on non-POST requests.
    """
    if request.method != 'POST':
        raise PopupException(_('POST request required.'))

    result = {'status': -1}

    source_type = request.POST.get('source')
    if source_type == 'file':
        content_type = request.POST.get('type')
        try:
            if content_type == 'separated':
                delimiter = request.POST.get('separator', ',')
                quote = request.POST.get('quote', '"')
                file_obj = request.fs.open(request.POST.get('path'))
                field_list = field_values_from_separated_file(file_obj, delimiter, quote)
                row = next(field_list)
                field_names = row.keys()
                # Chain the consumed first row back so sampling sees all rows.
                field_types = get_field_types((r.values() for r in itertools.chain([row], field_list)), iterations=51)
                file_obj.close()
                # list() keeps the pairs JSON-serializable on Python 3, where
                # zip() returns a lazy iterator.
                result['data'] = list(zip(field_names, field_types))
                result['status'] = 0
            elif content_type == 'morphlines':
                morphlines = json.loads(request.POST.get('morphlines'))
                # Look for entries that take on the form %{SYSLOGTIMESTAMP:timestamp}
                field_results = re.findall(r'\%\{(?P<type>\w+)\:(?P<name>\w+)\}', morphlines['expression'])
                if field_results:
                    result['data'] = []
                    for field_result in field_results:
                        result['data'].append(
                            (field_result[1], get_type_from_morphline_type(field_result[0]))
                        )
                    result['status'] = 0
                else:
                    result['status'] = 1
                    result['message'] = _('Could not detect any fields.')
            elif content_type == 'log':
                file_obj = request.fs.open(request.POST.get('path'))
                result['data'] = fields_from_log(file_obj)
                file_obj.close()
                result['status'] = 0
            else:
                result['status'] = 1
                result['message'] = _('Type %s not supported.') % content_type
        except Exception as e:
            # Python-3-compatible exception handling: `except X, e` and
            # `e.message` only exist on Python 2.
            LOG.exception(e)
            result['message'] = str(e)