def from_fileobjs(fileobjs: List[IO[bytes]],
                  records_format: BaseRecordsFormat,
                  processing_instructions: ProcessingInstructions) -> 'RecordsSchema':
    """Sniff a RecordsSchema from the contents of a delimited file.

    Reads a sample of rows (bounded by
    ``processing_instructions.max_inference_rows``) into a pandas
    DataFrame, infers a schema from it, then refines that schema
    against the same sample.

    :param fileobjs: Exactly one seekable binary file object to sniff.
    :param records_format: Delimited records format whose hints drive
        CSV parsing.
    :param processing_instructions: Controls sampling via
        ``max_inference_rows``.
    :return: The inferred RecordsSchema.
    :raises NotImplementedError: If more than one file object is
        provided, or the single file object is not seekable (a pure
        stream).
    """
    from records_mover.records.delimited import stream_csv
    from records_mover.pandas import purge_unnamed_unused_columns

    if len(fileobjs) != 1:
        # https://github.com/bluelabsio/records-mover/issues/84
        raise NotImplementedError('Cannot currently sniff schema from multiple '
                                  'files--please provide explicit schema JSON')
    fileobj = fileobjs[0]
    if not fileobj.seekable():
        # We rewind the stream below after sampling, which a pure
        # (non-seekable) stream cannot support.
        raise NotImplementedError('Cannot currently sniff schema from a pure stream--'
                                  'please save file to disk and load from there or '
                                  'provide explicit schema JSON')

    with stream_csv(fileobj, records_format.hints) as reader:  # type: ignore
        # Parse schema from a sample only; reading the whole file
        # could be arbitrarily expensive.
        sample_row_count = processing_instructions.max_inference_rows
        if sample_row_count is not None:
            df = reader.get_chunk(sample_row_count)
        else:
            df = reader.read()
        # Rewind so the caller can re-read the data afterwards.
        fileobj.seek(0)
        df = purge_unnamed_unused_columns(df)
        schema = RecordsSchema.from_dataframe(df,
                                              processing_instructions,
                                              include_index=False)
        schema = schema.refine_from_dataframe(
            df, processing_instructions=processing_instructions)
        return schema
def test_stream_filename(self, mock_io, mock_TextFileReader, mock_read_csv):
    """A plain filename is handed straight to read_csv without text-wrapping."""
    delimiter = Mock(name='field_delimiter')
    hints = {'field-delimiter': delimiter}
    filename = 'my_filename'
    with stream_csv(filename, hints) as reader:
        # read_csv should receive the filename plus the parsing options
        # derived from the hints.
        mock_read_csv.assert_called_with(filename,
                                         compression=None,
                                         encoding='utf-8',
                                         engine='python',
                                         escapechar=None,
                                         header='infer',
                                         iterator=True,
                                         sep=delimiter)
        # No TextIOWrapper layer should be created for a filename input.
        mock_io.TextIOWrapper.assert_not_called()
        self.assertEqual(reader, mock_read_csv.return_value)
    # Exiting the context manager must close the reader.
    mock_read_csv.return_value.close.assert_called()