def normalize_headers(header_row, long_headers, long_to_short_dict):
    """ Clean the headers (remove extra spaces and lowercase) and convert them to short headers if we're given
        long headers

        Args:
            header_row: an array of the file headers given
            long_headers: boolean indicating if we're using the long versions of the headers (True for long)
            long_to_short_dict: a dictionary containing a mapping from long headers to short ones for this file type

        Yields:
            A string containing the cleaned header name (converted to short version if long versions were provided
            and there is a mapping for that header).
    """
    # Headers that don't match the DB columns but are allowed by the broker,
    # mapped to their DB equivalents. The "refundsdof" spelling on the right is
    # the actual DB column name and must be preserved as-is.
    broker_aliases = {
        'deobligationsrecoveriesrefundsofprioryearbyprogramobjectclass_cpe':
            'deobligationsrecoveriesrefundsdofprioryearbyprogramobjectclass_cpe',
        'facevalueloanguarantee': 'facevalueofdirectloanorloanguarantee',
        'budgetauthorityavailableamounttotal_cpe': 'totalbudgetaryresources_cpe',
        'correctionlatedeleteindicator': 'correctiondeleteindicator',
        'place_of_performance_zip4': 'place_of_performance_zip4a'
    }

    for raw_header in header_row:
        cleaned = FieldCleaner.clean_name(raw_header)
        # Swap in the DB-matching name when this header is a known broker alias
        cleaned = broker_aliases.get(cleaned, cleaned)

        # yield the short header when applicable, otherwise yield the cleaned header, whatever it is
        if long_headers and cleaned in long_to_short_dict:
            yield FieldCleaner.clean_name(long_to_short_dict[cleaned])
        else:
            yield cleaned
def count_and_set_headers(self, csv_schema, header_row):
    """ Track how many times we've seen a field we were expecting and set self.expected_headers and
        self.flex_headers

        Args:
            csv_schema: list of FileColumn objects for this file type
            header_row: an array of the file headers given

        Returns:
            expected field dict {[expected field name]: [header count])
    """
    self.expected_headers = []
    self.flex_headers = []

    # Occurrence counts for every expected field, keyed by the shorter,
    # machine-readable column names
    occurrences = OrderedDict(
        (FieldCleaner.clean_name(column.name_short), 0) for column in csv_schema
    )

    for header in header_row:
        if header in occurrences:
            occurrences[header] += 1
            self.expected_headers.append(header)
            self.flex_headers.append(None)
        else:
            # Keep unexpected "flex_" headers in the flex list; any other
            # unexpected header is marked None so it is skipped when reading
            self.flex_headers.append(header if str(header).startswith("flex_") else None)
            self.expected_headers.append(None)
    return occurrences
def count_and_set_headers(self, csv_schema, header_row):
    """Track how many times we've seen a field we were expecting and set self.expected_headers and
    self.flex_headers

    NOTE(review): this appears to duplicate an earlier definition of the same method
    (differing only in plain dict vs OrderedDict) — if both live in one class the
    later definition shadows the earlier one; confirm which is intended.
    """
    self.expected_headers = []
    self.flex_headers = []

    # Expected-field occurrence counts, keyed by the shorter, machine-readable column names
    field_counts = {FieldCleaner.clean_name(column.name_short): 0 for column in csv_schema}

    for value in header_row:
        is_expected = value in field_counts
        if is_expected:
            field_counts[value] += 1
        # Only unexpected headers beginning with "flex_" are tracked in the flex list
        self.flex_headers.append(
            value if not is_expected and str(value).startswith("flex_") else None
        )
        # Unexpected headers become None so they are skipped when reading
        self.expected_headers.append(value if is_expected else None)
    return field_counts
def use_long_headers(header_row, long_to_short_dict):
    """Check to see if header contains long or short column names"""
    matched = sum(
        1 for header in header_row
        if FieldCleaner.clean_name(header) in long_to_short_dict
    )
    # if most of column headers are in the long format, we'll treat the file as having long headers
    return matched > 0.5 * len(header_row)