    def test_to_normalized_unprocessed_file_name(self) -> None:
        original_file_name = 'test_file_tag.csv'
        expected_file_name = 'unprocessed_2019-08-12T00:00:00:000000_raw_test_file_tag.csv'
        self.assertEqual(
            expected_file_name,
            to_normalized_unprocessed_file_name(
                original_file_name, GcsfsDirectIngestFileType.RAW_DATA,
                datetime.datetime(2019, 8, 12, 0, 0, 0)))
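# The test above pins down the expected naming scheme. For reference, here is
# a minimal sketch of a helper consistent with that expected output; the enum
# string values are assumptions inferred from the file names, and the
# project's real implementation may differ.
import datetime
from enum import Enum


class GcsfsDirectIngestFileType(Enum):
    RAW_DATA = 'raw'
    INGEST_VIEW = 'ingest_view'


def to_normalized_unprocessed_file_name(
        file_name: str,
        file_type: GcsfsDirectIngestFileType,
        dt: datetime.datetime) -> str:
    # Prefix the original name with a processing state, a microsecond-precision
    # timestamp, and the file type, producing e.g.
    # 'unprocessed_2019-08-12T00:00:00:000000_raw_test_file_tag.csv'.
    timestamp = dt.strftime('%Y-%m-%dT%H:%M:%S:%f')
    return f'unprocessed_{timestamp}_{file_type.value}_{file_name}'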
    def _upload_raw_data_file_to_bq(self, local_file_path: str) -> None:
        """Attempts to upload the given |local_file_path| to all relevant tables in BQ."""
        logging.info(
            '\n\n\n ============== Beginning processing for file %s ================ \n\n\n',
            local_file_path)
        _, file_name = os.path.split(local_file_path)
        file_tag, _ = os.path.splitext(file_name)

        normalized_file_name = to_normalized_unprocessed_file_name(
            file_name=file_name,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
            dt=self.import_time)
        file_id, file_already_processed = get_file_id_and_processed_status_for_file(
            metadata_type=MetadataType.RAW,
            region_code=self.region_code,
            client=self.client,
            project_id=self.project_id,
            normalized_file_name=normalized_file_name)

        # A non-null file_id means this file is already registered in the
        # metadata table; otherwise, claim the next available id.
        file_in_metadata = file_id is not None
        if file_id is None:
            file_id = get_next_available_file_id(
                metadata_type=MetadataType.RAW, client=self.client, project_id=self.project_id)

        if file_already_processed:
            logging.warning(
                'File %s is already marked as processed. Skipping file processing.',
                normalized_file_name)
            return

        df = self._get_dataframe_from_csv_with_extra_cols(
            local_file_path=local_file_path, file_id=file_id)

        raw_data_dataset = f'{self.region_code.lower()}_raw_data'
        logging.info('\n\nLoaded dataframe has %d rows\n', df.shape[0])
        logging.info(
            '\n\nLoaded dataframe with intent of uploading to %s.%s: \n\n%s',
            raw_data_dataset, file_tag, str(df.head()))

        if not self.dry_run:
            i = input('Continue? [y/n]: ')
            if i.upper() != 'Y':
                return

        self._append_df_to_table(dataset=raw_data_dataset,
                                 table_name=file_tag,
                                 df=df)
        self._mark_file_as_processed(file_id=file_id,
                                     file_tag=file_tag,
                                     normalized_file_name=normalized_file_name,
                                     processed_time=datetime.datetime.now(),
                                     file_exists_in_metadata=file_in_metadata)
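
    # A minimal sketch of the CSV-loading helper used above (a hypothetical
    # stand-in, not the project's actual implementation), assuming the "extra
    # cols" are the metadata file_id plus the import timestamp. Requires
    # pandas, imported as pd.
    def _get_dataframe_from_csv_with_extra_cols(
            self, local_file_path: str, file_id: int) -> 'pd.DataFrame':
        # Read every column as a string so raw values survive verbatim.
        df = pd.read_csv(local_file_path, dtype=str)
        # Tag each row with the metadata file_id and the import time so the
        # uploaded rows can be traced back to this import.
        df['file_id'] = file_id
        df['update_datetime'] = self.import_time
        return df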
Example #3
    def _generate_output_path(self,
                              ingest_view_export_args: GcsfsIngestViewExportArgs,
                              metadata: DirectIngestIngestFileMetadata) -> GcsfsFilePath:
        ingest_view = self.ingest_views_by_tag[ingest_view_export_args.ingest_view_name]
        if not metadata.normalized_file_name:
            output_file_name = to_normalized_unprocessed_file_name(
                f'{ingest_view.file_tag}.csv',
                GcsfsDirectIngestFileType.INGEST_VIEW,
                dt=ingest_view_export_args.upper_bound_datetime_to_export
            )
        else:
            output_file_name = metadata.normalized_file_name

        return GcsfsFilePath.from_directory_and_file_name(
            self.ingest_directory_path, output_file_name)
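
# Illustrative only: under the naming sketch from the first example (including
# its assumed 'ingest_view' enum value), the fallback branch above yields
# names such as the following for a hypothetical 'persons' ingest view.
example_name = to_normalized_unprocessed_file_name(
    'persons.csv',
    GcsfsDirectIngestFileType.INGEST_VIEW,
    datetime.datetime(2019, 8, 12))
assert example_name == 'unprocessed_2019-08-12T00:00:00:000000_ingest_view_persons.csv'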