# NOTE(review): the line below is a whitespace-collapsed fragment. As it
# stands it begins with '#', so Python reads the whole line as one comment.
# It also starts inside a loop whose header is not visible here (note the
# `break`), so it is kept verbatim. The steps it spells out, in order:
#   1. Stop iterating once `data` has more than 10 entries and more than 3
#      distinct RegionName values.
#   2. Replace every Date with its ISO-format date string.
#   3. Derive Confirmed from Value with every "(<digits>)" group removed, and
#      Deaths from the digits of the first such group (None when there is no
#      group); both go through safe_int_cast.
#   4. Drop Value, group by (RegionName, Date) and sum the non-null entries
#      of each remaining column; a group with no non-null entries gives None.
#   5. Reset the index and sort by Date, then RegionName.
# If the dataframe is not empty, then we found a good one if len(data) > 10 and len(data['RegionName'].unique()) > 3: break # Convert all dates to ISO format data['Date'] = data['Date'].apply(lambda date: date.date().isoformat()) def parenthesis(x): regexp = r'\((\d+)\)' return re.sub(regexp, '', x), (re.search(regexp, x) or [None, None])[1] # Get the confirmed and deaths data from the table data['Confirmed'] = data['Value'].apply( lambda x: safe_int_cast(parenthesis(x)[0])) data['Deaths'] = data['Value'].apply( lambda x: safe_int_cast(parenthesis(x)[1])) def aggregate_region_values(group: DataFrame): non_null = [value for value in group if not (isna(value) or isnull(value))] return None if not non_null else sum(non_null) # Add up all the rows with same Date and RegionName data = data.sort_values(['Date', 'RegionName']) data = data.drop(columns=['Value']).groupby(['RegionName', 'Date' ]).agg(aggregate_region_values) data = data.reset_index().sort_values(['Date', 'RegionName'])
def fix_temp(value: int):
    """Cast *value* with safe_int_cast and render a tenth of it as text.

    Returns None when safe_int_cast returns None; otherwise the cast value
    divided by 10, formatted with exactly one decimal place.

    NOTE(review): presumably the input is a temperature stored in tenths of
    a degree -- confirm against the caller.
    """
    as_int = safe_int_cast(value)
    if as_int is None:
        return None
    return '%.1f' % (as_int / 10.)
# Parse into datetime object, drop if not possible
df['Date'] = df['Date'].apply(lambda date: safe_datetime_parse(date, date_format))
# Take an explicit copy of the filtered rows so the column assignments below
# write to an independent frame instead of a selection of the original
# (this is the pattern that triggers pandas' SettingWithCopyWarning).
df = df[~df['Date'].isna()].copy()

# Convert all dates to ISO format
df['Date'] = df['Date'].apply(lambda date: date.date().isoformat())


def parenthesis(x):
    """Split *x* around its "(<digits>)" groups.

    Returns a 2-tuple: *x* with every "(<digits>)" group removed, and the
    digits of the first such group (None when *x* contains no group).
    """
    regexp = r'\((\d+)\)'
    # A successful match is indexable, so [1] is its first capture group; the
    # [None, None] fallback makes the same index yield None when nothing matched.
    return re.sub(regexp, '', x), (re.search(regexp, x) or [None, None])[1]


# Get the confirmed and deaths data from the table. Each Value is parsed once
# and both columns are read from that single result.
split_values = df['Value'].apply(parenthesis)
df['Confirmed'] = split_values.apply(lambda pair: safe_int_cast(pair[0]))
df['Deaths'] = split_values.apply(lambda pair: safe_int_cast(pair[1]))


def aggregate_region_values(group: DataFrame):
    """Sum the non-null entries of *group*; return None if there are none.

    NOTE(review): this is passed to ``groupby(...).agg``, so *group* is
    presumably the values of one column within one group rather than a
    whole DataFrame -- confirm and adjust the annotation if so.
    """
    non_null = [value for value in group if not (isna(value) or isnull(value))]
    return sum(non_null) if non_null else None


# Add up all the rows with same Date and RegionName
df = df.sort_values(['Date', 'RegionName'])
df = (
    df.drop(columns=['Value'])
    .groupby(['RegionName', 'Date'])
    .agg(aggregate_region_values)
)
df = df.reset_index().sort_values(['Date', 'RegionName'])

# Compute cumsum of the values region by region
value_columns = ['Confirmed', 'Deaths']