예제 #1
0
    def from_fileobjs(
            fileobjs: List[IO[bytes]], records_format: BaseRecordsFormat,
            processing_instructions: ProcessingInstructions
    ) -> 'RecordsSchema':
        """
        Infer a RecordsSchema by sampling the contents of a single file.

        The file is read through ``stream_csv`` using
        ``records_format.hints``.  At most
        ``processing_instructions.max_inference_rows`` rows are sampled;
        when that setting is ``None`` the whole file is read.  The file
        object is rewound to offset 0 once the sample has been taken.

        :param fileobjs: list that must contain exactly one seekable
            binary file object.
        :param records_format: format whose ``hints`` are handed to
            ``stream_csv``.
        :param processing_instructions: supplies ``max_inference_rows``
            and is passed on to the schema-building calls.
        :return: the schema built from the sample dataframe and then
            refined against that same dataframe.
        :raises NotImplementedError: if ``fileobjs`` does not hold
            exactly one entry, or if that entry is not seekable.
        """
        from records_mover.records.delimited import stream_csv
        from records_mover.pandas import purge_unnamed_unused_columns

        if len(fileobjs) != 1:
            # https://github.com/bluelabsio/records-mover/issues/84
            raise NotImplementedError(
                'Cannot currently sniff schema from multiple '
                'files--please provide explicit schema JSON')

        source = fileobjs[0]
        if not source.seekable():
            # The file is rewound after sampling, which a pure stream
            # cannot do.
            raise NotImplementedError(
                'Cannot currently sniff schema from a pure stream--'
                'please save file to disk and load from there or '
                'provide explicit schema JSON')

        with stream_csv(source, records_format.hints) as reader:  # type: ignore
            # Pull a sample dataframe: bounded when a row limit is
            # configured, otherwise everything the reader has.
            row_limit = processing_instructions.max_inference_rows
            sample = (reader.read() if row_limit is None
                      else reader.get_chunk(row_limit))

            # Rewind so the file object is back at its start.
            source.seek(0)

            sample = purge_unnamed_unused_columns(sample)
            inferred = RecordsSchema.from_dataframe(sample,
                                                    processing_instructions,
                                                    include_index=False)
            return inferred.refine_from_dataframe(
                sample, processing_instructions=processing_instructions)
예제 #2
0
 def test_stream_filename(self,
                          mock_io,
                          mock_TextFileReader,
                          mock_read_csv):
     # Passing a filename string to stream_csv: the read_csv mock should
     # receive that filename as-is, io.TextIOWrapper should not be
     # used, and the object yielded by the context manager should be
     # read_csv's return value, which must have been closed by the end.
     delimiter = Mock(name='field_delimiter')
     hints = {'field-delimiter': delimiter}
     filename = 'my_filename'

     # Keyword arguments read_csv is expected to be called with for
     # these hints.
     expected_kwargs = {
         'compression': None,
         'encoding': 'utf-8',
         'engine': 'python',
         'escapechar': None,
         'header': 'infer',
         'iterator': True,
         'sep': delimiter,
     }

     with stream_csv(filename, hints) as out:
         mock_read_csv.assert_called_with(filename, **expected_kwargs)
         mock_io.TextIOWrapper.assert_not_called()
         self.assertEqual(out, mock_read_csv.return_value)

     reader = mock_read_csv.return_value
     reader.close.assert_called()