예제 #1
0
    def test_prep_df_for_csv_output_include_index(self):
        """Check string rendering of a date index and a time column.

        The schema declares ``date``, ``time`` and ``timetz`` fields.  The
        frame is built with a ``pd.Timestamp`` index and a ``time`` column
        holding a ``pd.Timestamp``; the ``timetz`` column is declared but
        given no data.  With ``include_index=True`` the test expects the
        first index entry to be ``'1970-01-01'`` and the first time value
        to be ``'12:33:53'``.
        """
        schema_data = {
            'schema': "bltypes/v1",
            'fields': {
                "date": {
                    "type": "date",
                    "index": 1,
                },
                "time": {
                    "type": "time",
                    "index": 2,
                },
                "timetz": {
                    "type": "timetz",
                    "index": 3,
                },
            }
        }
        records_format = DelimitedRecordsFormat(variant='bluelabs')
        records_schema = RecordsSchema.from_data(schema_data)
        processing_instructions = ProcessingInstructions()
        # us_eastern = pytz.timezone('US/Eastern')
        data = {
            'time': [
                pd.Timestamp(year=1970, month=1, day=1,
                             hour=12, minute=33, second=53, microsecond=1234)
            ],
            # timetz is not well supported in records mover yet.  For
            # instance, specifying how it's turned into a CSV is not
            # currently part of the records spec:
            #
            #   https://github.com/bluelabsio/records-mover/issues/76
            #
            # In addition, Vertica suffers from a driver limitation:
            #
            #   https://github.com/bluelabsio/records-mover/issues/77
            #
            # 'timetz': [
            #     us_eastern.localize(pd.Timestamp(year=1970, month=1, day=1,
            #                                      hour=12, minute=33, second=53,
            #                                      microsecond=1234)),
            # ],
        }
        df = pd.DataFrame(data,
                          index=[pd.Timestamp(year=1970, month=1, day=1)],
                          columns=['time', 'timetz'])

        new_df = prep_df_for_csv_output(df=df,
                                        include_index=True,
                                        records_schema=records_schema,
                                        records_format=records_format,
                                        processing_instructions=processing_instructions)
        # Guard first, so a None result fails here rather than as an
        # attribute error on the lookups below.
        self.assertIsNotNone(new_df)
        self.assertEqual(new_df.index[0], '1970-01-01')
        # The index holds strings at this point, so an integer key passed
        # to Series.__getitem__ would only work through pandas' deprecated
        # positional fallback; .iloc states the positional intent directly.
        self.assertEqual(new_df['time'].iloc[0], '12:33:53')
        # self.assertEqual(new_df['timetz'].iloc[0], '12:33:53-05')
예제 #2
0
 def save_df(df: 'DataFrame', output_filename: str) -> None:
     """Write *df* to *output_filename* as CSV.

     The frame is first passed through ``prep_df_for_csv_output`` using the
     schema, format and processing instructions from the enclosing scope,
     then written with ``DataFrame.to_csv`` using the enclosing ``options``.
     """
     prep_kwargs = {
         'include_index': self.include_index,
         'records_schema': records_schema,
         'records_format': delimited_records_format,
         'processing_instructions': processing_instructions,
     }
     prepared = prep_df_for_csv_output(df, **prep_kwargs)
     prepared.to_csv(path_or_buf=output_filename,
                     index=self.include_index,
                     **options)
     logger.info('CSV file written')
예제 #3
0
    def test_timeonlyformat(self):
        """Run ``prep_df_for_csv_output`` once per entry in ``TIMEONLY_CASES``.

        Each case builds a one-row frame holding the same sample time twice,
        once as a ``pd.Timestamp`` and once as a ``datetime.time``, and
        expects both columns to come back equal to
        ``create_sample(timeonlyformat)``.  The hint value is passed as the
        assertion message so a failure names the offending format.
        """
        records_schema = RecordsSchema.from_data({
            'schema': "bltypes/v1",
            'fields': {
                "time_as_timestamp": {
                    "type": "time",
                    "index": 1,
                },
                "time_as_time": {
                    "type": "time",
                    "index": 2,
                },
            }
        })
        processing_instructions = ProcessingInstructions()
        column_names = ['time_as_timestamp', 'time_as_time']
        # Both sample values are immutable, so they are built once and
        # reused for every case.
        sample_timestamp = pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY,
                                        hour=SAMPLE_HOUR, minute=SAMPLE_MINUTE,
                                        second=SAMPLE_SECOND)
        sample_time = datetime.time(hour=SAMPLE_HOUR,
                                    minute=SAMPLE_MINUTE,
                                    second=SAMPLE_SECOND)
        for timeonlyformat in TIMEONLY_CASES:
            hints = {'timeonlyformat': timeonlyformat}
            records_format = DelimitedRecordsFormat(variant='bluelabs',
                                                    hints=hints)
            df = pd.DataFrame({'time_as_timestamp': [sample_timestamp],
                               'time_as_time': [sample_time]},
                              columns=column_names)

            new_df = prep_df_for_csv_output(df=df,
                                            include_index=False,
                                            records_schema=records_schema,
                                            records_format=records_format,
                                            processing_instructions=processing_instructions)
            for column_name in column_names:
                self.assertEqual(new_df[column_name][0],
                                 create_sample(timeonlyformat),
                                 timeonlyformat)
            self.assertIsNotNone(new_df)
예제 #4
0
    def test_dateformat(self):
        """Run ``prep_df_for_csv_output`` once per entry in ``DATE_CASES``.

        Each case builds a one-row frame holding the same sample date twice,
        once as a ``pd.Timestamp`` and once as a ``datetime.date``, and
        expects both columns to come back equal to
        ``create_sample(dateformat)``.
        """
        schema_data = {
            'schema': "bltypes/v1",
            'fields': {
                "date_as_timestamp": {
                    "type": "date",
                    "index": 1,
                },
                # Each field needs its own position; this previously
                # repeated index 1.
                "date_as_date": {
                    "type": "date",
                    "index": 2,
                },
            }
        }
        records_schema = RecordsSchema.from_data(schema_data)
        processing_instructions = ProcessingInstructions()
        for dateformat in DATE_CASES:
            records_format = DelimitedRecordsFormat(variant='bluelabs',
                                                    hints={
                                                        'dateformat': dateformat
                                                    })
            data = {
                'date_as_timestamp': [
                    pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY)
                ],
                'date_as_date': [
                    datetime.date(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY)
                ],
            }
            df = pd.DataFrame(data,
                              columns=['date_as_timestamp', 'date_as_date'])

            new_df = prep_df_for_csv_output(df=df,
                                            include_index=False,
                                            records_schema=records_schema,
                                            records_format=records_format,
                                            processing_instructions=processing_instructions)
            # Guard first, so a None result fails here rather than on the
            # column lookups below.
            self.assertIsNotNone(new_df)
            # The hint is passed as the failure message so a failing case
            # identifies which dateformat broke.
            self.assertEqual(new_df['date_as_timestamp'][0],
                             create_sample(dateformat),
                             dateformat)
            self.assertEqual(new_df['date_as_date'][0],
                             create_sample(dateformat),
                             dateformat)
예제 #5
0
    def test_datetimeformat(self):
        """Run ``prep_df_for_csv_output`` once per entry in ``DATETIME_CASES``.

        Each case builds a one-row frame with a single ``pd.Timestamp`` in a
        ``datetime`` column and expects that value to come back unchanged.
        """
        schema_data = {
            'schema': "bltypes/v1",
            'fields': {
                # Named to match the DataFrame column built below; this was
                # previously "datetimez", which matched no column.
                "datetime": {
                    "type": "datetime",
                    "index": 1,
                },
            }
        }
        records_schema = RecordsSchema.from_data(schema_data)
        processing_instructions = ProcessingInstructions()
        for datetimeformat in DATETIME_CASES:
            records_format = DelimitedRecordsFormat(variant='bluelabs',
                                                    hints={
                                                        'datetimeformat': datetimeformat
                                                    })
            timestamp = pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY,
                                     hour=SAMPLE_HOUR, minute=SAMPLE_MINUTE,
                                     second=SAMPLE_SECOND)

            data = {
                'datetime': [
                    timestamp
                ],
            }
            df = pd.DataFrame(data, columns=['datetime'])

            new_df = prep_df_for_csv_output(df=df,
                                            include_index=False,
                                            records_schema=records_schema,
                                            records_format=records_format,
                                            processing_instructions=processing_instructions)
            # Guard first, so a None result fails here rather than on the
            # column lookup below.
            self.assertIsNotNone(new_df)
            # No conversion is done of datetime as pandas' CSV
            # outputter handles it properly, so we should expect the
            # original again.  The third argument is only the failure
            # message: the sample rendered for this datetimeformat.
            self.assertEqual(new_df['datetime'][0],
                             timestamp,
                             create_sample(datetimeformat))
예제 #6
0
        def write_dfs(path_or_buf: Union[str, IO[str]]) -> int:
            """Append every dataframe in ``dfs_source.dfs`` to *path_or_buf* as CSV.

            Each dataframe is passed through ``prep_df_for_csv_output`` and
            then written with ``DataFrame.to_csv`` in append mode.  The
            header row, if enabled in the enclosing ``options``, is written
            only with the first dataframe.

            Args:
                path_or_buf: Destination handed to ``DataFrame.to_csv``.

            Returns:
                The sum of ``len(df.index)`` over the prepared dataframes.
            """
            first_chunk = True
            move_count = 0
            for df in dfs_source.dfs:
                logger.info("Appending from dataframe...")

                # Include the header at most once in the file.  The override
                # goes into a per-dataframe copy so the enclosing `options`
                # dict is not modified and keeps its original header setting.
                csv_options = dict(options,
                                   header=options['header'] and first_chunk)
                first_chunk = False
                df = prep_df_for_csv_output(df,
                                            include_index=dfs_source.include_index,
                                            records_schema=records_schema,
                                            records_format=records_format,
                                            processing_instructions=processing_instructions)
                df.to_csv(path_or_buf=path_or_buf,
                          mode="a",
                          index=dfs_source.include_index,
                          **csv_options)
                move_count += len(df.index)
            return move_count