def _are_contents_empty(self,
                        args: GcsfsIngestArgs,
                        contents_handle: GcsfsFileContentsHandle) -> bool:
    """Returns True if the CSV file is empty, i.e. it contains no non-header rows."""
    delegate = ReadOneGcsfsCsvReaderDelegate()
    self.csv_reader.streaming_read(args.file_path,
                                   delegate=delegate,
                                   chunk_size=1,
                                   skiprows=1)
    return delegate.df is None
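# A minimal sketch of the "read one chunk" delegate pattern used above. This is a
# hypothetical stand-in, not the actual ReadOneGcsfsCsvReaderDelegate, and it assumes
# the streaming reader invokes an on_dataframe() callback once per parsed chunk.
# Because _are_contents_empty skips the header row and requests a chunk_size of 1,
# delegate.df stays None exactly when the file has no data rows.
from typing import Optional

import pandas as pd


class ReadOneChunkDelegateSketch:
    """Hypothetical delegate that keeps only the first chunk and stops the read."""

    def __init__(self) -> None:
        self.df: Optional[pd.DataFrame] = None

    def on_dataframe(self, encoding: str, chunk_num: int, df: pd.DataFrame) -> bool:
        self.df = df
        # Returning False signals the reader to stop after the first chunk.
        return False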
def _get_validated_columns(
        self, path: GcsfsFilePath, file_config: DirectIngestRawFileConfig
) -> List[str]:
    """Returns a list of normalized column names for the raw data file at the given path."""
    # TODO(#3020): We should not derive the columns from what we get in the uploaded raw data CSV - we
    #  should instead define the set of columns we expect to see in each input CSV (with mandatory
    #  documentation) and update this function to make sure that the columns in the CSV are a strict
    #  subset of the expected columns. This will allow us to gracefully handle any raw data re-imports
    #  where a new column gets introduced in a later file.
    delegate = ReadOneGcsfsCsvReaderDelegate()
    self.csv_reader.streaming_read(
        path,
        delegate=delegate,
        chunk_size=1,
        nrows=1,
        **self._common_read_csv_kwargs(file_config),
    )
    df = delegate.df

    if not isinstance(df, pd.DataFrame):
        raise ValueError(f"Unexpected type for DataFrame: [{type(df)}]")

    columns = self.remove_column_non_printable_characters(df.columns)

    # Strip whitespace from head/tail of column names
    columns = [c.strip() for c in columns]

    normalized_columns = set()
    for i, column_name in enumerate(columns):
        if not column_name:
            raise ValueError(
                f"Found empty column name in [{file_config.file_tag}]")

        column_name = self._convert_non_allowable_bq_column_chars(column_name)

        # BQ doesn't allow column names to begin with a number, so we prepend an underscore in that case
        if column_name[0] in string.digits:
            column_name = "_" + column_name

        if column_name in normalized_columns:
            raise ValueError(
                f"Multiple columns with name [{column_name}] after normalization."
            )
        normalized_columns.add(column_name)
        columns[i] = column_name

    return columns
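# The normalization above relies on _convert_non_allowable_bq_column_chars, whose body is
# not shown here. A plausible sketch (an assumption, not the actual implementation):
# BigQuery column names may contain only ASCII letters, digits, and underscores, so every
# other character is replaced with an underscore. The leading-digit case is handled
# separately in the loop above.
import re


def convert_non_allowable_bq_column_chars_sketch(column_name: str) -> str:
    return re.sub(r"[^a-zA-Z0-9_]", "_", column_name)


# Example: convert_non_allowable_bq_column_chars_sketch("DOC ID#") -> "DOC_ID_"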
def _file_meets_file_line_limit(self, line_limit: int, path: GcsfsFilePath) -> bool:
    delegate = ReadOneGcsfsCsvReaderDelegate()
    # Read a chunk up to one line bigger than the acceptable size
    self.csv_reader.streaming_read(path,
                                   delegate=delegate,
                                   chunk_size=(line_limit + 1))

    if delegate.df is None:
        # If the file is empty, it's fine.
        return True

    # If the length of the only chunk is less than or equal to the acceptable
    # size, the file meets the line limit.
    return len(delegate.df) <= line_limit
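# Worked example of the limit check above, using pandas directly (illustrative values
# only): reading a single chunk of at most line_limit + 1 rows is enough to decide
# whether the file exceeds the limit without ever loading the whole file into memory.
import pandas as pd

line_limit = 3

chunk = pd.DataFrame({"col": range(4)})   # the first chunk came back with 4 data rows
assert not (len(chunk) <= line_limit)     # 4 > 3, so the file fails the line limit

chunk = pd.DataFrame({"col": range(2)})   # only 2 data rows exist in the file
assert len(chunk) <= line_limit           # 2 <= 3, so the file meets the line limit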
def _get_validated_columns(
        self, path: GcsfsFilePath, file_config: DirectIngestRawFileConfig) -> List[str]:
    """Returns a list of normalized column names for the raw data file at the given path."""
    # TODO(#3020): We should not derive the columns from what we get in the uploaded raw data CSV - we
    #  should instead define the set of columns we expect to see in each input CSV (with mandatory
    #  documentation) and update this function to make sure that the columns in the CSV are a strict
    #  subset of the expected columns. This will allow us to gracefully handle any raw data re-imports
    #  where a new column gets introduced in a later file.
    delegate = ReadOneGcsfsCsvReaderDelegate()
    self.csv_reader.streaming_read(
        path,
        delegate=delegate,
        chunk_size=1,
        nrows=1,
        **self._common_read_csv_kwargs(file_config))
    df = delegate.df

    if not isinstance(df, pd.DataFrame):
        raise ValueError(f'Unexpected type for DataFrame: [{type(df)}]')

    columns = self.remove_column_non_printable_characters(df.columns)

    # Strip whitespace from head/tail of column names
    columns = [c.strip() for c in columns]

    for column_name in columns:
        if not column_name:
            raise ValueError(
                f'Found empty column name in [{file_config.file_tag}]')
        non_allowable_chars = self._get_non_allowable_bq_column_chars(column_name)
        if non_allowable_chars:
            # TODO(#3020): Some regions (US_MO) are known to have unsupported chars in their column
            #  names - will need to implement how we reliably convert these column names.
            raise ValueError(
                f'Column [{column_name}] for file has non-allowable characters {non_allowable_chars}.')

    return columns
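# This older variant calls _get_non_allowable_bq_column_chars, whose body is not shown.
# A plausible sketch (an assumption, not the actual implementation): report every
# character in the column name that BigQuery does not allow, i.e. anything outside
# ASCII letters, digits, and underscores.
import string
from typing import List

_ALLOWED_BQ_COLUMN_CHARS = set(string.ascii_letters + string.digits + "_")


def get_non_allowable_bq_column_chars_sketch(column_name: str) -> List[str]:
    return sorted(set(column_name) - _ALLOWED_BQ_COLUMN_CHARS)


# Example: get_non_allowable_bq_column_chars_sketch("DOC ID#") -> [' ', '#']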
def _get_validated_columns(
        self, path: GcsfsFilePath, file_config: DirectIngestRawFileConfig
) -> List[str]:
    """Returns a list of normalized column names for the raw data file at the given path."""
    # TODO(#3807): We should not derive the columns from what we get in the uploaded raw data CSV - we
    #  should instead define the set of columns we expect to see in each input CSV (with mandatory
    #  documentation) and update this function to make sure that the columns in the CSV are a strict
    #  subset of the expected columns. This will allow us to gracefully handle any raw data re-imports
    #  where a new column gets introduced in a later file.
    delegate = ReadOneGcsfsCsvReaderDelegate()
    self.csv_reader.streaming_read(
        path,
        delegate=delegate,
        chunk_size=1,
        encodings_to_try=file_config.encodings_to_try(),
        nrows=1,
        **self._common_read_csv_kwargs(file_config),
    )
    df = delegate.df

    if not isinstance(df, pd.DataFrame):
        raise ValueError(f"Unexpected type for DataFrame: [{type(df)}]")

    columns = self.remove_column_non_printable_characters(df.columns)

    # Strip whitespace from head/tail of column names
    columns = [c.strip() for c in columns]

    normalized_columns = set()
    for i, column_name in enumerate(columns):
        if not column_name:
            raise ValueError(
                f"Found empty column name in [{file_config.file_tag}]")

        column_name = self._convert_non_allowable_bq_column_chars(column_name)

        # BQ doesn't allow column names to begin with a number, so we prepend an underscore in that case
        if column_name[0] in string.digits:
            column_name = "_" + column_name

        # If the capitalization of the column name doesn't match the capitalization
        # listed in the file config, update the capitalization.
        if column_name not in file_config.columns:
            caps_normalized_col = file_config.caps_normalized_col(column_name)
            if caps_normalized_col:
                column_name = caps_normalized_col

        if column_name in normalized_columns:
            raise ValueError(
                f"Multiple columns with name [{column_name}] after normalization."
            )
        normalized_columns.add(column_name)
        columns[i] = column_name

    if len(normalized_columns) == 1:
        # A single-column file is almost always indicative of a parsing error. If
        # this column name is not registered in the file config, we throw.
        column = one(normalized_columns)
        if column not in file_config.columns:
            raise ValueError(
                f"Found only one column: [{column}]. Columns likely did not "
                f"parse properly. Are you using the correct separator and encoding "
                f"for this file? If this file really has just one column, the "
                f"column name must be registered in the raw file config before "
                f"upload.")

    return columns
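# The capitalization fix-up above depends on file_config.caps_normalized_col, whose body
# is not shown. A plausible sketch (an assumption, not the actual implementation): look
# for a configured column whose name matches the parsed header case-insensitively and
# return the configured capitalization, or None if there is no such column.
from typing import Iterable, Optional


def caps_normalized_col_sketch(
        configured_columns: Iterable[str], column_name: str) -> Optional[str]:
    for configured in configured_columns:
        if configured.lower() == column_name.lower():
            return configured
    return None


# Example: caps_normalized_col_sketch(["OffenderID"], "OFFENDERID") -> "OffenderID"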