def _guess_field_types(self, sample_rows):
    """Return one guessed field type per column, in column order.

    For every column index below ``self._num_columns`` the values found at
    that index are collected from the sample rows and handed to
    ``guess_field_type_from_samples``; the results are returned as a list.
    """
    def values_at(index):
        # Rows too short to contain this column contribute nothing to it.
        return [row[index] for row in sample_rows if index < len(row)]

    return [
        guess_field_type_from_samples(values_at(index))
        for index in range(self._num_columns)
    ]
def _guess_field_types(self, sample_rows):
    """Guess the field type of each of the ``self._num_columns`` columns.

    Each column's samples are the values at that position in every sample
    row that is long enough to have one. The guesses, as produced by
    ``guess_field_type_from_samples``, are returned in column order.
    """
    column_count = self._num_columns
    return [
        guess_field_type_from_samples(
            [row[position] for row in sample_rows if len(row) > position]
        )
        for position in range(column_count)
    ]
def guess_field_types(request):
    """Guess columns and fetch a small data sample for an import source.

    The JSON-encoded ``fileFormat`` POST parameter is decoded and its
    ``inputFormat`` key selects the branch: ``localfile``, ``file``,
    ``table``, ``query``, ``rdbms``, ``stream`` or ``connector``. Each branch
    produces a mapping that is returned as JSON; the branches that build it
    here give it a ``columns`` entry and a ``sample`` entry.

    Args:
        request: the incoming request; ``POST``, ``user`` and ``fs`` are read.

    Returns:
        A ``JsonResponse`` wrapping the mapping built by the selected branch.

    Raises:
        PopupException: if the input format, the stream selection or the
            connector selection is not recognized, or if Salesforce refuses
            the sample query.
    """
    # Local import: only the 'localfile' branch needs it.
    from itertools import islice

    file_format = json.loads(request.POST.get('fileFormat', '{}'))
    input_format = file_format['inputFormat']

    if input_format == 'localfile':
        # NOTE(review): the path comes straight from the request and is opened
        # on the server's local filesystem -- confirm callers restrict it.
        path = urllib_unquote(file_format['path'])

        with open(path, 'r') as local_file:
            # At most a header row plus four data rows are used, so avoid
            # loading the whole file into memory.
            head_rows = list(islice(csv.reader(local_file), 5))

        if file_format['format']['hasHeader']:
            sample = head_rows[1:5]
            # An empty file has no header row; report no columns instead of
            # failing with IndexError.
            header = head_rows[0] if head_rows else []
            column_row = [re.sub('[^0-9a-zA-Z]+', '_', col) for col in header]
        else:
            sample = head_rows[:4]
            # Column count is taken from the first row (none for empty files).
            first_row = sample[0] if sample else []
            column_row = [
                'field_' + str(count + 1) for count in range(len(first_row))
            ]

        field_type_guesses = [
            guess_field_type_from_samples(
                [row[count] for row in sample if len(row) > count]
            )
            for count in range(len(column_row))
        ]
        columns = [
            Field(name, field_type).to_dict()
            for name, field_type in zip(column_row, field_type_guesses)
        ]
        format_ = {'columns': columns, 'sample': sample}

    elif input_format == 'file':
        indexer = MorphlineIndexer(request.user, request.fs)
        path = urllib_unquote(file_format['path'])
        if path.endswith(('xls', 'xlsx')):
            path = excel_to_csv_file_name_change(path)

        stream = request.fs.open(path)
        # Detect the encoding from the first bytes, then rewind so the
        # indexer reads the stream from the start.
        encoding = check_encoding(stream.read(10000))
        LOG.debug('File %s encoding is %s', path, encoding)
        stream.seek(0)

        _convert_format(file_format['format'], inverse=True)

        format_ = indexer.guess_field_types({
            'file': {
                'stream': stream,
                'name': path,
            },
            'format': file_format['format'],
        })

        # Note: Would also need to set charset to table (only supported in Hive)
        if 'sample' in format_ and format_['sample']:
            format_['sample'] = escape_rows(
                format_['sample'], nulls_only=True, encoding=encoding)
        for col in format_['columns']:
            col['name'] = smart_unicode(
                col['name'], errors='replace', encoding=encoding)

    elif input_format == 'table':
        sample = get_api(request, {'type': 'hive'}).get_sample_data(
            {'type': 'hive'},
            database=file_format['databaseName'],
            table=file_format['tableName'])
        db = dbms.get(request.user)
        table_metadata = db.get_table(
            database=file_format['databaseName'],
            table_name=file_format['tableName'])

        format_ = {
            'sample': sample['rows'][:4],
            'columns': [
                Field(
                    col.name,
                    HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type, 'string')
                ).to_dict()
                for col in table_metadata.cols
            ],
        }

    elif input_format == 'query':
        # 'query' is either a mapping carrying an 'id' or the id itself.
        query_id = (
            file_format['query']['id']
            if file_format['query'].get('id')
            else file_format['query']
        )
        notebook = Notebook(document=Document2.objects.document(
            user=request.user, doc_id=query_id)).get_data()
        snippet = notebook['snippets'][0]
        db = get_api(request, snippet)

        if file_format.get('sampleCols'):
            # The client already supplied columns and sample; reuse them.
            columns = file_format.get('sampleCols')
            sample = file_format.get('sample')
        else:
            snippet['query'] = snippet['statement']
            try:
                sample = db.fetch_result(
                    notebook, snippet, 4, start_over=True)['rows'][:4]
            except Exception as e:
                # Deliberate best effort: fall back to empty placeholder rows.
                LOG.warning(
                    'Skipping sample data as query handle might be expired: %s'
                    % e)
                sample = [[], [], [], [], []]
            columns = db.autocomplete(snippet=snippet, database='', table='')
            columns = [
                Field(
                    col['name'],
                    HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'], 'string')
                ).to_dict()
                for col in columns['extended_columns']
            ]

        format_ = {
            'sample': sample,
            'columns': columns,
        }

    elif input_format == 'rdbms':
        api = _get_api(request)
        sample = api.get_sample_data(
            None,
            database=file_format['rdbmsDatabaseName'],
            table=file_format['tableName'])

        format_ = {
            'sample': list(sample['rows'])[:4],
            'columns': [
                Field(col['name'], col['type']).to_dict()
                for col in sample['full_headers']
            ],
        }

    elif input_format == 'stream':
        if file_format['streamSelection'] == 'kafka':
            data = get_topic_data(
                request.user, file_format.get('kafkaSelectedTopics'))

            # Every Kafka field is exposed as a non-unique string column.
            format_ = {
                'sample': data['rows'],
                'columns': [
                    Field(col['name'], 'string', unique=False).to_dict()
                    for col in data['full_headers']
                ],
            }
        elif file_format['streamSelection'] == 'flume':
            if 'hue-httpd/access_log' in file_format['channelSourcePath']:
                columns = [
                    {'name': 'id', 'type': 'string', 'unique': True},
                    {'name': 'client_ip', 'type': 'string'},
                    {'name': 'time', 'type': 'date'},
                    {'name': 'request', 'type': 'string'},
                    {'name': 'code', 'type': 'plong'},
                    {'name': 'bytes', 'type': 'plong'},
                    {'name': 'method', 'type': 'string'},
                    {'name': 'url', 'type': 'string'},
                    {'name': 'protocol', 'type': 'string'},
                    {'name': 'app', 'type': 'string'},
                    {'name': 'subapp', 'type': 'string'},
                ]
            else:
                columns = [{'name': 'message', 'type': 'string'}]

            format_ = {
                # Placeholder sample: four rows of '...' cells.
                'sample': [['...'] * len(columns)] * 4,
                'columns': [
                    Field(
                        col['name'],
                        HiveFormat.FIELD_TYPE_TRANSLATE.get(
                            col['type'], 'string'),
                        unique=col.get('unique')
                    ).to_dict()
                    for col in columns
                ],
            }
        else:
            # Without this, format_ would be unbound at the return below.
            raise PopupException(
                _('Stream format not recognized: %(streamSelection)s')
                % file_format)

    elif input_format == 'connector':
        if file_format['connectorSelection'] == 'sfdc':
            sf = Salesforce(
                username=file_format['streamUsername'],
                password=file_format['streamPassword'],
                security_token=file_format['streamToken'])
            table_metadata = [
                {'name': column['name'], 'type': column['type']}
                for column in sf.restful(
                    'sobjects/%(streamObject)s/describe/' % file_format
                )['fields']
            ]
            # NOTE(review): 'streamObject' is request-supplied and is
            # interpolated into the query unescaped -- confirm it is validated.
            query = 'SELECT %s FROM %s LIMIT 4' % (
                ', '.join([col['name'] for col in table_metadata]),
                file_format['streamObject'])
            LOG.debug('Salesforce sample query: %s', query)

            try:
                records = sf.query_all(query)
            except SalesforceRefusedRequest as e:
                raise PopupException(message=str(e))

            format_ = {
                # The first value of every record is dropped from the sample.
                'sample': [
                    list(row.values())[1:] for row in records['records']
                ],
                'columns': [
                    Field(
                        col['name'],
                        HiveFormat.FIELD_TYPE_TRANSLATE.get(
                            col['type'], 'string')
                    ).to_dict()
                    for col in table_metadata
                ],
            }
        else:
            raise PopupException(
                _('Connector format not recognized: %(connectorSelection)s')
                % file_format)

    else:
        raise PopupException(
            _('Input format not recognized: %(inputFormat)s') % file_format)

    return JsonResponse(format_)