import json
import logging

import pandas as pd

log = logging.getLogger(__name__)


def write(self, df):
    row_count, col_count = df.shape
    # chunksize needs to be one more than the row count, so the whole frame is
    # written as a single chunk - if we do get a UnicodeDecodeError, no rows
    # will have been written
    chunksize = row_count + 1
    try:
        # Test whether the data encodes cleanly - we need to test for this here,
        # before calling to_csv, which writes line by line - otherwise the lines
        # preceding an error would cause duplicate key constraints when the file
        # is imported into the DB
        json.dumps(df.to_dict(outtype='records')).encode('ascii')
    except UnicodeDecodeError:
        # Encoding failed - rather than ditch the whole batch, loop through and
        # write each row individually, logging an error for failures. Some of
        # these failures are just corrupt records in KE EMu - for example,
        # fields containing mojibake such as DarFieldNumber:1=ÃÆâ
        for i in range(row_count):
            # Get one row of the dataframe as a new frame
            df_row = df[i:i + 1]
            try:
                # Try to write the row
                df_row.to_csv(self.path, mode='a', columns=self.columns.keys(),
                              index=False, header=False, encoding='utf-8')
            except UnicodeDecodeError:
                # On failure, log an error with the _id of that row
                log.critical('UTF8 encoding error for record irn=%s',
                             df_row.iloc[-1]['_id'])
    else:
        # Batch is good - write the whole frame to CSV in one go
        df.to_csv(self.path, chunksize=chunksize, mode='a',
                  columns=self.columns.keys(), index=False, header=False,
                  encoding='utf-8')
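
# For context, a minimal sketch (Python 2) of the pre-flight check used above:
# json.dumps raises UnicodeDecodeError as soon as it hits a byte string that is
# not valid UTF-8, so corrupt rows surface before to_csv has appended anything
# to the file. The records and field values below are hypothetical.
def _encoding_check_sketch():
    good_batch = [{'_id': 1, 'DarFieldNumber:1': 'ABC123'}]
    bad_batch = [{'_id': 2, 'DarFieldNumber:1': 'ABC\xe2'}]  # truncated UTF-8 sequence

    json.dumps(good_batch).encode('ascii')  # passes - safe to write as one chunk

    try:
        json.dumps(bad_batch).encode('ascii')
    except UnicodeDecodeError:
        # At least one record is corrupt - fall back to row-by-row writes so the
        # good rows are kept and only the bad ones are logged and skipped
        log.critical('corrupt record in batch - falling back to row-by-row writes')
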
def write(self, df):
    log.info("Saving records to CKAN resource %s", self.resource_id)
    # Convert all empty/null values to None, so they become NULL values in postgres
    for col, np_type in self.columns.iteritems():
        if np_type.startswith('float'):
            # Ensure any float fields with value 0.0 are actually None
            df[col][df[col] == 0.0] = None
        else:
            # BUGFIX: Multimedia fields were being populated with an empty
            # string rather than NULL
            df[col][df[col].astype(str) == ''] = None
    # Remove internal columns (fields starting with _); iterate over a static
    # list of column names so dropping inplace doesn't skip columns
    for col in list(df.columns):
        if col.startswith('_'):
            df.drop(col, axis=1, inplace=True)
    # Convert all NaN to None
    df = df.where(pd.notnull(df), None)
    # Convert records to a list of dictionaries
    records = df.to_dict(outtype='records')
    datastore_params = {
        'resource_id': self.resource_id,
        'records': records,
        'force': True
        # 'primary_key': '_id'
    }
    # Check that the data doesn't contain invalid chars
    try:
        json.dumps(datastore_params).encode('ascii')
    except UnicodeDecodeError:
        # At least one of the records contains invalid chars - loop through,
        # validating each record and keeping only those that encode cleanly
        validated_records = []
        for record in datastore_params['records']:
            try:
                json.dumps(record).encode('ascii')
            except UnicodeDecodeError:
                log.critical('Error encoding record: %s',
                             ' '.join(['%s=%s' % (field, value)
                                       for field, value in record.iteritems()
                                       if value]))
            else:
                validated_records.append(record)
        datastore_params['records'] = validated_records
    self.remote_ckan.action.datastore_upsert(**datastore_params)
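
# self.remote_ckan above is assumed to be a ckanapi RemoteCKAN client. A minimal
# sketch of wiring one up and calling datastore_upsert directly - the URL, API
# key, resource id and records are placeholders, not values from the original code.
from ckanapi import RemoteCKAN

remote_ckan = RemoteCKAN('http://ckan.example.org', apikey='my-api-key')

remote_ckan.action.datastore_upsert(
    resource_id='32b8b677-1abd-4a0b-aa91-0bcf2e9c6c2f',
    records=[
        {'catalogNumber': 'NHM-0001', 'scientificName': 'Panthera leo'},
        {'catalogNumber': 'NHM-0002', 'scientificName': 'Canis lupus'},
    ],
    # force=True mirrors the writer above; it allows writing to a resource
    # whose datastore is marked read-only
    force=True,
)
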