def pipeline_restore_version(slug, file_id):
    pipeline = get_object_or_404(Pipeline, slug=slug)
    form = forms.RestoreVersionForm()
    is_form_valid = form.validate_on_submit()
    if is_form_valid and form.proceed.data != YES:
        return redirect(
            url_for(
                'uploader_views.pipeline_data_upload',
                slug=pipeline.slug,
            )
        )
    data_file_latest = pipeline.latest_version
    file_contents_latest, _ = CSVParser.get_csv_sample(
        data_file_latest.data_file_url,
        data_file_latest.delimiter,
        data_file_latest.quote,
    )
    data_file_to_restore = get_object_or_404(PipelineDataFile, pipeline=pipeline, id=file_id)
    file_contents_to_restore, _ = CSVParser.get_csv_sample(
        data_file_to_restore.data_file_url,
        data_file_to_restore.delimiter,
        data_file_to_restore.quote,
    )
    if is_form_valid:
        data_file_to_restore.state = DataUploaderFileState.VERIFIED.value
        data_file_to_restore.save()
        thread = process_pipeline_data_file(data_file_to_restore)
        thread.start()
        return redirect(
            url_for(
                'uploader_views.pipeline_data_uploaded',
                slug=pipeline.slug,
                file_id=data_file_to_restore.id,
            )
        )
    return render_uploader_template(
        'pipeline_restore_version.html',
        form=form,
        file_contents_latest=file_contents_latest,
        file_contents_to_restore=file_contents_to_restore,
        column_types_latest=dict(data_file_latest.column_types),
        column_types_to_restore=dict(data_file_to_restore.column_types),
        format_row_data=format_row_data,
    )
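# process_pipeline_data_file, as called above and in pipeline_data_verify
# below, is expected to return an *unstarted* threading.Thread so the view can
# kick off ingestion in the background. A minimal sketch of that contract,
# assuming the real task body (parsing, copying to the datasets bucket, state
# updates) lives elsewhere:
import threading


def process_pipeline_data_file(pipeline_data_file):
    def run():
        # Placeholder for the real ingestion work on pipeline_data_file.
        pass

    return threading.Thread(target=run)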
def test_get_s3_file_sample_invalid_lines(mock_tabulator_stream, app):
    csv_string = 'hello,goodbye\nbad\n1,2'
    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')
    assert not result
    assert err == (
        'Unable to process CSV file: row 2 has a different number of '
        'data points (1) than there are column headers (2)'
    )
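# The tests in this module share the mock_tabulator_stream fixture and the
# _mock_stream_return_values helper, neither of which is shown in this
# section. The sketch below is an assumption about their shape: the fixture
# patches the tabulator Stream class where CSVParser imports it (the patch
# target path is hypothetical), and the helper feeds the mock parsed rows
# built from raw CSV strings.
import csv
import io
from unittest import mock

import pytest


@pytest.fixture
def mock_tabulator_stream():
    # 'app.uploader.csv_parser.Stream' is an assumed import path.
    with mock.patch('app.uploader.csv_parser.Stream') as stream:
        yield stream


def _mock_stream_return_values(mock_stream, csv_strings, delimiter=',', quotechar='"'):
    streams = []
    for csv_string in csv_strings:
        rows = list(csv.reader(io.StringIO(csv_string), delimiter=delimiter, quotechar=quotechar))
        stream = mock.MagicMock()
        stream.headers = rows[0] if rows else []
        stream.sample = rows[1:]
        streams.append(stream)
    # Each Stream(...) context manager entry yields the next mocked stream.
    mock_stream.return_value.__enter__.side_effect = streams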
def test_get_s3_file_sample_empty_file(mock_tabulator_stream, app):
    csv_string = ' '
    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')
    assert not result
    assert err == (
        'Unable to process CSV file: no headers found. The first line of the csv should '
        'contain the column headers.'
    )
def test_get_s3_file_sample_when_empty_headers(mock_tabulator_stream, app):
    csv_string = 'hello,,goodbye,\n1,2,3,4\n5,6,7,8'
    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')
    assert not err
    # columns with an empty header are ignored
    assert result == [('hello', 'integer', ['1', '5']), ('goodbye', 'integer', ['3', '7'])]
def test_get_s3_file_sample_when_extra_data_column(mock_tabulator_stream, app):
    csv_string = 'hello,goodbye\n1,2,3\n4,5,6\n7,8,9'
    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')
    assert err == (
        'Unable to process CSV file: row 2 has a different number of '
        'data points (3) than there are column headers (2)'
    )
    assert not result
@pytest.mark.parametrize('csv_string,delimiter,quotechar', [
    # Illustrative cases only; the original parametrize list is not shown here.
    ('hello,goodbye\n1,2\n3,4\n5,6', ',', '"'),
    ('hello;goodbye\n1;2\n3;4\n5;6', ';', '"'),
])
def test_get_s3_file_sample(mock_tabulator_stream, csv_string, delimiter, quotechar, app):
    _mock_stream_return_values(mock_tabulator_stream, [csv_string], delimiter, quotechar)
    result, err = CSVParser.get_csv_sample('', delimiter, number_of_lines_sample=2)
    assert not err
    assert len(result) == 2
def test_get_s3_file_sample_with_invalid_header_names(mock_tabulator_stream, app):
    csv_string = 'spaces in header,weird :@£$% characters,Uppercase\n1,2,3\n4,5,6\n7,8,9'
    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')
    assert err == (
        'Unable to process CSV file: column headers must start with a letter and may only '
        'contain lowercase letters, numbers, and underscores. Invalid headers: "spaces in '
        'header", "weird :@£$% characters", "Uppercase"'
    )
    assert not result
def test_get_s3_file_sample_when_duplicate_header_names(mock_tabulator_stream, app):
    csv_string = 'hello,goodbye,goodbye\n1,2,3\n4,5,6\n7,8,9'
    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')
    assert not err
    # duplicate headers are made unique
    assert result == [
        ('hello', 'integer', ['1', '4', '7']),
        ('goodbye_1', 'integer', ['2', '5', '8']),
        ('goodbye_2', 'integer', ['3', '6', '9']),
    ]
def test_get_s3_file_sample_infer_data_types_big_sample(mock_tabulator_stream, app):
    csv_string = 'int,bool,text,datetime,date,numeric,mix\n'
    for i in range(1000):
        if i == 900:
            # A single all-text row deep in the sample forces every column to 'text'.
            csv_string += 'text,text,text,text,text,text,text\n'
            continue
        csv_string += '2000,true,test,2006-11-26T16:30:00Z,2004-01-01,3.1,test\n'
    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')
    assert not err
    for _column, col_type, _ in result:
        assert col_type == 'text'
def _move_file_to_s3(file_url, organisation, dataset, delimiter, quote):
    bucket = app.config['s3']['bucket_url']
    file_name = file_url.split('/')[-1]
    full_url = os.path.join(bucket, file_url)
    utf_8_byte_stream = CSVParser.get_csv_as_utf_8_byte_stream(
        full_url=full_url,
        delimiter=delimiter,
        quotechar=quote,
    )
    file_info = FileInfo(file_url, utf_8_byte_stream)
    storage = StorageFactory.create(bucket)
    datasets_folder = app.config['s3']['datasets_folder']
    target_file_url = f'{datasets_folder}/{organisation}/{dataset}/{file_name}'
    storage.write_file(target_file_url, file_info.data)
    file_info.data.seek(0)
    return file_info
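# FileInfo, as used above, looks like a small value object pairing the source
# file URL with the normalised UTF-8 byte stream that is written to S3 and
# then rewound for the caller. A minimal sketch consistent with that usage;
# the real class lives elsewhere in the project.
import io
from dataclasses import dataclass


@dataclass
class FileInfo:
    file_url: str
    data: io.BytesIO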
def test_get_s3_file_sample_infer_data_types(mock_tabulator_stream, app):
    csv_string = (
        'int,bool,text,datetime,date,numeric,mix\n'
        '2000,true,test,2006-11-26T16:30:00Z,2004-01-01,3.1,test\n'
        '13,false,test,2018-12-18T12:10:00Z,1998-12-26,-1,-2'
    )
    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')
    assert not err
    assert result == [
        ('int', 'integer', ['2000', '13']),
        ('bool', 'boolean', ['true', 'false']),
        ('text', 'text', ['test', 'test']),
        ('datetime', 'timestamp', ['2006-11-26T16:30:00Z', '2018-12-18T12:10:00Z']),
        ('date', 'date', ['2004-01-01', '1998-12-26']),
        ('numeric', 'numeric', ['3.1', '-1']),
        ('mix', 'text', ['test', '-2']),
    ]
def test_get_s3_file_sample_with_no_data(mock_tabulator_stream, app):
    csv_string = 'hello,goodbye'
    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')
    assert err == 'Unable to process CSV file: no data found'
    assert not result
@pytest.mark.parametrize('headers,unique_headers', [
    # Illustrative cases only; the original parametrize list is not shown here.
    (['hello', 'goodbye'], ['hello', 'goodbye']),
    (['hello', 'goodbye', 'goodbye'], ['hello', 'goodbye_1', 'goodbye_2']),
])
def test_make_unique_headers(headers, unique_headers):
    assert CSVParser.make_unique_headers(headers) == unique_headers
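# A possible implementation of CSVParser.make_unique_headers, inferred from
# the expected behaviour in the duplicate-header tests above (repeated names
# become name_1, name_2, ...). A sketch only, not the project's actual code.
from collections import Counter


def make_unique_headers(headers):
    total = Counter(headers)
    seen = Counter()
    unique_headers = []
    for header in headers:
        if total[header] > 1:
            seen[header] += 1
            unique_headers.append(f'{header}_{seen[header]}')
        else:
            unique_headers.append(header)
    return unique_headers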
def pipeline_data_verify(slug, file_id):
    pipeline = get_object_or_404(Pipeline, slug=slug)
    pipeline_data_file = get_object_or_404(PipelineDataFile, pipeline=pipeline, id=file_id)
    form = forms.VerifyDataFileForm()
    is_form_valid = form.validate_on_submit()
    if is_form_valid and form.proceed.data != YES:
        pipeline_data_file.delete()
        delete_file(pipeline_data_file)
        return redirect(url_for('uploader_views.pipeline_select'))
    new_file_contents, new_file_err = CSVParser.get_csv_sample(
        pipeline_data_file.data_file_url,
        pipeline_data_file.delimiter,
        pipeline_data_file.quote,
    )
    current_file_contents, current_column_types, missing_headers = None, None, set()
    if pipeline.latest_version:
        data_file_latest = pipeline.latest_version
        current_file_contents, current_file_err = CSVParser.get_csv_sample(
            data_file_latest.data_file_url,
            data_file_latest.delimiter,
            data_file_latest.quote,
        )
        missing_headers = get_missing_headers(
            current_version=current_file_contents,
            new_version=new_file_contents,
        )
        current_column_types = dict(data_file_latest.column_types)
    if is_form_valid:
        selected_column_types = [
            (column, request.form[column]) for column, _, _ in new_file_contents
        ]
        pipeline_data_file.column_types = selected_column_types
        pipeline_data_file.state = DataUploaderFileState.VERIFIED.value
        pipeline_data_file.save()
        thread = process_pipeline_data_file(pipeline_data_file)
        thread.start()
        return redirect(
            url_for(
                'uploader_views.pipeline_data_uploaded',
                slug=pipeline.slug,
                file_id=pipeline_data_file.id,
            )
        )
    if new_file_err is None:
        uploaded_columns = set(x[0] for x in new_file_contents)
        error_message = check_for_reserved_column_names(pipeline_data_file, uploaded_columns)
        if error_message:
            new_file_contents = None
            new_file_err = error_message
    if new_file_err is not None:
        pipeline_data_file.state = DataUploaderFileState.FAILED.value
        pipeline_data_file.error_message = new_file_err
        pipeline_data_file.save()
        form.errors['non_field_errors'] = [new_file_err]
    return render_uploader_template(
        'pipeline_data_verify.html',
        pipeline=pipeline,
        new_file_contents=new_file_contents,
        current_file_contents=current_file_contents,
        current_column_types=current_column_types,
        data_types=DataUploaderDataTypes.values(),
        format_row_data=format_row_data,
        form=form,
        missing_headers=missing_headers,
    )
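# A sketch of get_missing_headers consistent with its use above: both
# arguments are CSVParser.get_csv_sample results, i.e. lists of
# (header, data_type, sample_values) tuples, and the result is the set of
# headers present in the current version but absent from the new upload.
# Hypothetical, for illustration only.
def get_missing_headers(current_version, new_version):
    current_headers = {column for column, _, _ in current_version or []}
    new_headers = {column for column, _, _ in new_version or []}
    return current_headers - new_headers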