def add_epi_dates(df):
    """Add 'epi_week' and 'epi_year' columns derived from the 'date' column.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a datetime-like 'date' column plus 'location',
        'location_name', 'cum_death', 'inc_death', 'cum_case', 'inc_case'.

    Returns
    -------
    pd.DataFrame
        Same data with epi_week / epi_year prepended and columns reordered.
    """
    # Compute the epiweek once per row; the original called Week.fromdate
    # twice per date (once for .week, once for .year).
    weeks = df.date.apply(Week.fromdate)
    df['epi_week'] = weeks.apply(lambda w: w.week)
    df['epi_year'] = weeks.apply(lambda w: w.year)
    df = df[['epi_week', 'epi_year', 'date', 'location', 'location_name',
             'cum_death', 'inc_death', 'cum_case', 'inc_case']]
    return df
def get_newunit(value):
    """Map a date-like column label onto the configured time unit.

    Relies on enclosing-scope names: ``unit`` ('week' | 'month' | 'year' |
    'full'), ``weekasdate`` ('start' | 'end' | other) and the accumulator
    list ``time_cols`` (new labels are appended as a side effect).
    Non-date labels (first character not a digit) pass through unchanged,
    except when ``unit == 'full'`` where everything collapses to 'total'.
    """
    if value[0].isdecimal():
        date = pd.to_datetime(value)
        if unit == 'week':
            epiweek = str(Week.fromdate(date, system="cdc"))  # get epiweeks
            year, week = epiweek[:4], epiweek[-2:]
            if weekasdate in ['start', 'end']:
                # Represent the week by its first or last calendar day.
                if weekasdate == 'start':
                    epiweek = str(Week(int(year), int(week)).startdate())
                else:
                    epiweek = str(Week(int(year), int(week)).enddate())
            else:
                epiweek = year + '_' + 'EW' + week
            if epiweek not in time_cols:
                time_cols.append(epiweek)
            return epiweek
        elif unit == 'month':
            year_month = date.strftime("%Y-%m")
            if year_month not in time_cols:
                time_cols.append(year_month)
            return year_month
        elif unit == 'year':
            year = date.strftime("%Y")
            if year not in time_cols:
                time_cols.append(year)
            return year
        elif unit == 'full':
            return 'total'
        # NOTE(review): a date-like value with an unrecognised unit falls
        # through and implicitly returns None — confirm this is intended.
    else:
        if unit == 'full':
            return 'total'
        else:
            return value
def _date_to_api_string(date: date, time_type: str = "day") -> str: # pylint: disable=W0621 """Convert a date object to a YYYYMMDD or YYYYMM string expected by the API.""" if time_type == "day": date_str = date.strftime("%Y%m%d") elif time_type == "week": date_str = Week.fromdate(date).cdcformat() return date_str
def export_csv(df, geo_name, sensor, export_dir, start_date):
    """Export data set in format expected for ingestion by the API.

    Writes one CSV per distinct timestamp, named
    ``weekly_YYYYWW_{geo_name}_{sensor}.csv`` using the CDC epiweek of the
    timestamp.

    Parameters
    ----------
    df: pd.DataFrame
        data frame with columns "geo_id", "timestamp", "val", "se" and
        "sample_size"
    geo_name: str
        name of the geographic region, such as "state" or "hrr"
    sensor: str
        name of the sensor; only used for naming the output file
    export_dir: str
        path to location where the output CSV files to be uploaded should be stored
    start_date: datetime.datetime
        The first date to report; earlier rows are dropped
    """
    df = df.copy()
    df = df[df["timestamp"] >= start_date]
    for date in df["timestamp"].unique():
        # Name each file by the epiweek (year + zero-padded week number).
        t = Week.fromdate(pd.to_datetime(str(date)))
        date_short = "weekly_" + str(t.year) + str(t.week).zfill(2)
        export_fn = f"{date_short}_{geo_name}_{sensor}.csv"
        result_df = df[df["timestamp"] == date][[
            "geo_id", "val", "se", "sample_size"
        ]]
        result_df.to_csv(f"{export_dir}/{export_fn}",
                         index=False,
                         float_format="%.8f")
def parse_cities_request(self, response):
    """Fan out one registral request per (city, year, epiweek) pair.

    Presumably a Scrapy-style callback (yields requests) — confirm.
    ``response.body`` is expected to be a JSON list of cities.
    """
    cities = json.loads(response.body)
    today = date_utils.today()
    current_week = Week.fromdate(today)
    # We have to do different passes for 2019 and 2020, since the specific days of
    # the epidemiological week differs.
    #
    # The api seems to return the data from the current year as "2020", and the previous as "2019",
    # so we'll exploit that to extract the data only from the "2020" chart
    for city in cities:
        for year in [2020, 2019]:
            # Only weeks strictly before the current epiweek are requested.
            for weeknum in range(1, current_week.week):
                ep_week = Week(year, weeknum)
                # Cache more than 4 weeks ago
                should_cache = (current_week.week - weeknum) > 4
                yield self.make_registral_request(
                    city=city,
                    ep_week=ep_week,
                    callback=self.parse_registral_request,
                    dont_cache=not should_cache,
                )
def plot_cummulative_sampling_fraction( df ):
    """Plot the weekly fraction of reported cases that were sequenced.

    Expects columns "date", "new_cases" and "new_sequences"; rows are
    bucketed by the start date of their epidemiological week. Returns a
    plotly Figure with a log-scaled percentage y-axis.
    """
    df["epiweek"] = df["date"].apply( lambda x: Week.fromdate(x).startdate() )
    plot_df = df.groupby( "epiweek" ).agg( new_cases = ("new_cases", "sum"), new_sequences = ("new_sequences", "sum" ) )
    # Weeks with no sequences would produce a zero fraction; drop them.
    plot_df = plot_df.loc[plot_df["new_sequences"]>0]
    plot_df["fraction"] = plot_df["new_sequences"] / plot_df["new_cases"]
    plot_df = plot_df.reset_index()
    fig = go.Figure()
    fig.add_trace( go.Scattergl( x=plot_df["epiweek"], y=plot_df["fraction"], mode='lines', name='Fraction', line={ "color" : '#767676', "width" : 4 } ) )
    _add_date_formating( fig )
    fig.update_layout( yaxis_tickformat='.1%' )
    # NOTE(review): min_lim / max_lim are computed but never used below —
    # presumably intended for fig.update_yaxes(range=...); confirm.
    cleaned_array = np.log10( plot_df.loc[plot_df["fraction"] > 0, "fraction"] )
    cleaned_array = cleaned_array[~np.isinf( cleaned_array )]
    min_lim = np.floor( cleaned_array.min() )
    max_lim = np.ceil( cleaned_array.max() )
    fig.update_yaxes( type="log", title="<b>Cases sequenced (%)</b>" )
    fig.update_xaxes( range=get_date_limits( plot_df["epiweek"] ) )
    return fig
def parse_filtered_metadata(metadata_file, tip_to_tree, label_fields, tree_fields, table_fields, database_date_column):
    """Build taxon objects for query sequences from a filtered metadata CSV.

    Returns a 4-tuple:
        query_dict     - query name -> taxon
        query_id_dict  - query id -> taxon
        tree_to_tip    - tree name -> list of taxa placed in that tree
        closest_seqs   - set of names of closest database sequences
    """
    query_dict = {}
    query_id_dict = {}
    closest_seqs = set()
    tree_to_tip = defaultdict(list)
    # First pass only reads the header names.
    with open(metadata_file, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        headers = reader.fieldnames
    with open(metadata_file, "r", encoding="utf-8") as f:
        in_data = csv.DictReader(f)
        for sequence in in_data:
            country = sequence["country"]
            query_id = sequence['query_id']
            query_name = sequence['query']
            closest_name = sequence["closest"]
            sample_date = sequence[database_date_column] #this may need to be flexible if using a different background database
            closest_distance = sequence["SNPdistance"]
            snps = sequence['SNPs']
            if query_id not in query_id_dict: #it's in the fasta file and in the db, this should take the db
                new_taxon = taxon(query_name, country, label_fields, tree_fields, table_fields)
                new_taxon.query_id = query_id
                if query_name == closest_name: #if it's in database, get its sample date
                    new_taxon.in_db = True
                    new_taxon.sample_date = sample_date
                    new_taxon.epiweek = Week.fromdate(convert_date(sample_date))
                    new_taxon.closest = "NA"
                else:
                    new_taxon.closest = closest_name
                    new_taxon.closest_distance = closest_distance
                    new_taxon.snps = snps
                    closest_seqs.add(closest_name)
                # Record which tree (if any) this query was placed in.
                if query_name in tip_to_tree:
                    relevant_tree = tip_to_tree[query_name]
                else:
                    relevant_tree = "NA"
                new_taxon.tree = relevant_tree
                tree_to_tip[relevant_tree].append(new_taxon)
                query_dict[query_name] = new_taxon
                query_id_dict[query_id] = new_taxon
    return query_dict, query_id_dict, tree_to_tip, closest_seqs
def get_epiweeks(value):
    """Turn a date-like string into a 'YYYY_EWww' CDC epiweek label.

    Values whose first character is not a digit (e.g. header text) are
    returned unchanged. New labels are appended to the enclosing-scope
    list ``ew_cols`` as a side effect.
    """
    if not value[0].isdecimal():
        # Not a date: pass the label through untouched.
        return value
    parsed = pd.to_datetime(value)
    label = str(Week.fromdate(parsed, system="cdc"))  # get epiweeks
    label = '{}_EW{}'.format(label[:4], label[-2:])
    if label not in ew_cols:
        ew_cols.append(label)
    return label
def addEpiWeek(self):
    """Add an 'epiweek' column (YYYYWW strings) derived from self.data.date.

    If the column already exists it is only coerced to str. Returns self
    so calls can be chained.
    """
    if 'epiweek' in self.data:
        self.data.epiweek = self.data.epiweek.astype(str)
        return self
    from datetime import date as _date
    from epiweeks import Week
    epiweeks = []
    for dt in self.data.date:
        # BUG FIX: epiweeks' Week.fromdate takes a single date object, not
        # (year, month, day) positional arguments — the original raised a
        # TypeError on every call.
        week = Week.fromdate(_date(dt.year, dt.month, dt.day))
        epiweeks.append("{:04d}{:02d}".format(week.year, week.week))
    self.data['epiweek'] = epiweeks
    return self
def get_week_just_from_date(self, date):
    """Return the epiweek number of *date* after applying the leap offset.

    The epi-year is the calendar year when *date* falls inside
    [self.dates[year], self.dates[year + 1]), otherwise the next year;
    self.get_leap(year).days is then added before computing the week.
    """
    calendar_year = date.year
    in_current_epi_year = self.dates[calendar_year] <= date < self.dates[calendar_year + 1]
    year = calendar_year if in_current_epi_year else calendar_year + 1
    shifted = date + timedelta(days=self.get_leap(year).days)
    return Week.fromdate(shifted).week
def regulation_release(state, grid, config, parameters, current_time):
    """Compute the expected release for regulated reservoirs.

    Mutates state.reservoir_release in place; returns None.
    """
    # compute the expected monthly release based on Biemans (2011)
    # TODO this is still written assuming monthly, but here's the epiweek for when that is relevant
    epiweek = Week.fromdate(current_time).week
    month = current_time.month
    streamflow_time_name = config.get(
        'water_management.reservoirs.streamflow_time_resolution')
    # initialize to the average flow
    state.reservoir_release = grid.reservoir_streamflow_schedule.mean(
        dim=streamflow_time_name).values
    # TODO what is k
    k = state.reservoir_storage_operation_year_start / (
        parameters.reservoir_regulation_release_parameter *
        grid.reservoir_storage_capacity)
    # TODO what is factor
    factor = np.where(
        grid.reservoir_runoff_capacity >
        parameters.reservoir_runoff_capacity_condition,
        (2.0 / grid.reservoir_runoff_capacity)**2.0, 0)
    # release is some combination of prerelease, average flow in the time period, and total average flow
    state.reservoir_release = np.where(
        (grid.reservoir_use_electricity > 0) |
        (grid.reservoir_use_irrigation > 0),
        np.where(
            grid.reservoir_runoff_capacity <= 2.0,
            k * grid.reservoir_prerelease_schedule.sel({
                streamflow_time_name: month
            }).values,
            k * factor * grid.reservoir_prerelease_schedule.sel({
                streamflow_time_name: month
            }).values + (1 - factor) * grid.reservoir_streamflow_schedule.sel({
                streamflow_time_name: month
            }).values),
        np.where(
            grid.reservoir_runoff_capacity <= 2.0,
            k * grid.reservoir_streamflow_schedule.mean(
                dim=streamflow_time_name).values,
            k * factor * grid.reservoir_streamflow_schedule.mean(
                dim=streamflow_time_name).values +
            (1 - factor) * grid.reservoir_streamflow_schedule.sel({
                streamflow_time_name: month
            }).values))
def reservoir_release(state, grid, config, parameters, current_time):
    """Compute release from reservoirs for the current timestep.

    Snapshots per-reservoir storage at the start of its operational year,
    then delegates to regulation_release() and storage_targets(), all of
    which mutate *state* in place. Returns None.
    """
    # compute release from reservoirs
    # TODO so much logic was dependent on monthly, so still assuming monthly for now, but here's the epiweek for when that is relevant
    epiweek = Week.fromdate(current_time).week
    month = current_time.month
    # if it's the start of the operational year for the reservoir, set it's start of op year storage to the current storage
    state.reservoir_storage_operation_year_start = np.where(
        state.reservoir_month_start_operations == month,
        state.reservoir_storage,
        state.reservoir_storage_operation_year_start)
    regulation_release(state, grid, config, parameters, current_time)
    storage_targets(state, grid, config, parameters, current_time)
def make_objects(metadata_file):
    """Parse a lineage metadata CSV into taxon and lineage objects.

    Expects columns (by position): name, country, _, date, epiweek,
    lineage. Returns (lin_obj_dict, taxa, current_date) where
    current_date is the most recent sample date seen.
    """
    #epiweeks = time.make_epiweeks()
    lineage_objects = []
    taxa = []
    tax_dict = {}
    tax_with_dates = []
    lineages_to_taxa = defaultdict(list)
    lin_obj_dict = {}
    with open(metadata_file) as f:
        next(f)  # skip the header row
        for l in f:
            toks = l.strip("\n").split(",")
            tax_name = toks[0]
            country = toks[1]
            date = toks[3]
            epiweek = toks[4]
            lin_string = toks[5]
            metadata = [country, date, epiweek]
            new_taxon = classes.taxon(tax_name, lin_string, metadata)
            taxa.append(new_taxon)
            if new_taxon.date_dt != "NA":
                tax_with_dates.append(new_taxon)
            tax_dict[tax_name] = new_taxon
            lineages_to_taxa[lin_string].append(new_taxon)
    # Most recent sampling date across all dated taxa defines "current".
    current_date = sorted(tax_with_dates, key=sortkey2, reverse=True)[0].date_dt
    current_week = Week.fromdate(current_date)
    for lin, lin_specific_taxa in lineages_to_taxa.items():
        l_o = classes.lineage(lin, lin_specific_taxa, current_date, current_week)
        lin_obj_dict[lin] = l_o
    lin_obj_dict = parse_travel_history(lin_obj_dict, tax_dict, metadata_file)
    return lin_obj_dict, taxa, current_date
def cumulative_seqs_over_time(figdir, locations_to_dates, lineage):
    """Plot per-epiweek sequence counts (bars) and a cumulative total (line).

    Saves the figure as an SVG in *figdir*; returns None.
    locations_to_dates maps a location to the list of sample dates seen there.
    """
    dates = []
    epiweek_lst = []
    for k, v in locations_to_dates.items():
        dates.extend(v)
    date_counts = Counter(dates)
    # Running total of sequences by date.
    seq_number = 0
    cum_counts = {}
    for date, value in sorted(date_counts.items()):
        seq_number = seq_number + value
        cum_counts[date] = seq_number
    # Bucket each date by the start day of its epiweek.
    for i in dates:
        epiweek_lst.append(Week.fromdate(i).startdate())
    epiweek_counts = Counter(epiweek_lst)
    sorted_epiweeks = OrderedDict(sorted(epiweek_counts.items()))
    fig, ax1 = plt.subplots(1, 1, figsize=(12, 4))
    ax1.bar(list(sorted_epiweeks.keys()),
            list(sorted_epiweeks.values()),
            color="#86b0a6",
            width=5)
    ax2 = ax1.twinx()
    ax2.plot(list(cum_counts.keys()),
             list(cum_counts.values()),
             linewidth=3,
             color="dimgrey")
    # ylims = (0,4000)
    ax1.spines['top'].set_visible(False)
    ax2.spines['top'].set_visible(False)
    ax1.xaxis.set_tick_params(rotation=90)
    ax1.set_xlabel("Date")
    ax2.set_ylabel("Total")
    ax1.set_ylabel("Sequence count")
    # ax2.set_ylim(ylims)
    plt.savefig(os.path.join(
        figdir, f"Cumulative_sequence_count_over_time_{lineage}.svg"),
        format='svg',
        bbox_inches='tight')
def map_to_week(df, date_column='date_today', groupby_target=None):
    """
    map a date column to the end date of its epidemiological week

    :param df: dataframe
    :type df: pandas.DataFrame
    :param date_column: column name holding the dates to map (modified in place)
    :type date_column: str
    :param groupby_target: if given, group by the mapped week and sum these column(s)
    :type groupby_target: None or str or list
    :return: dataframe with week end dates (aggregated if groupby_target given)
    :rtype: pandas.DataFrame
    """
    # NaT/None entries are passed through untouched.
    df[date_column] = df[date_column].apply(
        lambda x: Week.fromdate(x).enddate() if pd.notna(x) else x)
    df[date_column] = pd.to_datetime(df[date_column])
    if groupby_target is not None:
        # BUG FIX: group by the configured date column, not the hard-coded
        # 'date_today' — the original broke for any other column name.
        df = df.groupby(date_column, as_index=False)[groupby_target].sum()
    return df
def date_string_to_epi_day(date_string):
    """
    parse a date string in YYYY-MM-DD format and return cumulative epi day
    which is cumulative total days since 2019-12-22 (day 1). Returns ""
    for unparseable input or dates before epi-week 52 of 2019.
    """
    try:
        date = datetime.strptime(date_string, '%Y-%m-%d').date()
    except (ValueError, TypeError):
        # Narrowed from a bare except: bad strings raise ValueError,
        # non-strings raise TypeError; anything else should propagate.
        return ""
    # this is epi-week week:
    week = Week.fromdate(date)
    # this is day 1 of epi-week 0:
    day_one = datetime.strptime("2019-12-22", '%Y-%m-%d').date()
    if week.year < 2019 or (week.year == 2019 and week.week < 52):
        return ""
    cum_epi_day = (date - day_one).days + 1
    return str(cum_epi_day)
def collate_diffs(encoder, regions, typos, mask):
    """
    Stream output from encode_diffs to collate the incidence of each genetic
    difference by location and date, and return as a tabular data set.
    :param encoder: generator, returned by encode_diffs()
    :param regions: dict, counts keyed by region, country and collection date
    :param typos: passed through to parse_header() for name corrections
    :param mask: dict of positions whose listed 'alt' substitutions are ignored
    :return: nested dict region -> country -> "year|week" -> diff key -> count
    """
    res = {}
    for qname, diffs, missing in filter_outliers(encoder):
        region, country, coldate = parse_header(qname, regions, typos)
        if coldate is None:
            continue
        coldate = parse_date(coldate)
        epiweek = Week.fromdate(coldate).week
        year = coldate.year
        yeek = '{}|{:02d}'.format(year, epiweek)
        # update nested dictionaries
        if region not in res:
            res.update({region: {}})
        if country not in res[region]:
            res[region].update({country: {}})
        if yeek not in res[region][country]:
            res[region][country].update({yeek: {}})
        # iterate through genetic differences in this genome
        branch = res[region][country][yeek]  # shorthand
        for diff in diffs:
            typ, pos, alt = diff
            # NOTE(review): membership uses int(pos) but lookup uses pos —
            # if mask keys and pos types ever differ this raises KeyError;
            # confirm pos is already an int here.
            if typ == '~' and int(pos) in mask and alt in mask[pos]['alt']:
                # masked substitution
                continue
            if typ != '-' and 'N' in alt:
                # ignore substitutions and insertions with uncalled bases
                continue
            key = '|'.join(map(str, diff))
            if key not in branch:
                branch.update({key: 0})
            branch[key] += 1
    return res
def date_string_to_epi_week(date_string):
    """
    parse a date string in YYYY-MM-DD format and return cumulative epi week
    which is cumulative total epidemiological weeks since 2019-12-22.
    Week beginning 2019-12-22 is week 0. Returns "" for unparseable input
    or dates before epi-week 52 of 2019.
    """
    try:
        date = datetime.strptime(date_string, '%Y-%m-%d').date()
    except (ValueError, TypeError):
        # Narrowed from a bare except: bad strings raise ValueError,
        # non-strings raise TypeError; anything else should propagate.
        return ""
    # this is epi-week:
    week = Week.fromdate(date)
    if week.year < 2019 or (week.year == 2019 and week.week < 52):
        return ""
    elif week.year == 2019:
        return "0"
    # Total weeks in every complete year from 2020 up to (excluding) this
    # year, plus the week index within the current year. Year.totalweeks()
    # avoids materialising every Week object as the original chain() did.
    cum_epi_week = week.week + sum(
        Year(y).totalweeks() for y in range(2020, week.year))
    return str(cum_epi_week)
def generate_week_periods(open_future_periods, page_limit, begin_period, direction, direction_change): weeks_to_display = {} # When the user first visits the period screen the begin_period variable is empty. # Therefore, use the current week as default. week = Week.thisweek("iso") + open_future_periods # If begin_period variable has a date, use it to calculate the weeks to display. if begin_period != '': week = Week.fromdate(datetime.datetime.strptime(begin_period, '%Y-%m-%d'), 'iso') # This logic is to fix week discrepancy when a user clicks + and changes the direction and press - or vice versa if direction_change: if direction == '+': week += page_limit - 1 if direction == '-': week -= page_limit - 1 # We should not open future dates for data entry. The -1 is to prevent from opening this week. if direction == '+' and week + page_limit > Week.thisweek("iso") + open_future_periods: week = Week.thisweek("iso") + open_future_periods - page_limit - 1 rng = range(page_limit, 0, -1) if direction == '+' else range(page_limit) for key, i in enumerate(rng): w = week + i if direction == '+' else week - (i + 1) weeks_to_display[str(key + 1)] = { "period": w.isoformat(), "display": "W{} - {} - {}".format(w.weektuple()[1], w.startdate(), w.enddate()) } # Take the first week to calculate the beginning period in the next screen. if direction == '+' and i == page_limit: begin_period = str(w.enddate()) # Take the final week to calculate the beginning week in the next screen. if direction == '-' and i == page_limit - 1: begin_period = str(w.startdate()) return begin_period, weeks_to_display
def get_period_from_date(self, year, date):
    """Return the reporting period (3-week bucket) containing *date*.

    The date is first shifted by the year's leap offset
    (self.get_leap(year).days), then its epiweek number is mapped onto
    a 1-based period index of three weeks each.
    """
    shifted = date + timedelta(days=self.get_leap(year).days)
    week_number = Week.fromdate(shifted).week
    return math.ceil(week_number / 3)
def storage_targets(state: State, grid: Grid, config: Benedict, parameters: Parameters, current_time: datetime) -> None:
    """Define the necessary drop in storage based on the reservoir storage targets at the start of the month.

    Mutates state.reservoir_release in place; returns None.

    Args:
        state (State): the model state
        grid (Grid): the model grid
        config (Config): the model configuration
        parameters (Parameters): the model parameters
        current_time (datetime): the current simulation time
    """
    # TODO the logic here is really hard to follow... can it be simplified or made more readable?
    # TODO this is still written assuming monthly, but here's the epiweek for when that is relevant
    epiweek = Week.fromdate(current_time).week
    month = current_time.month
    streamflow_time_name = config.get(
        'water_management.reservoirs.streamflow_time_resolution')
    # if flood control active and has a flood control start
    flood_control_condition = (grid.reservoir_use_flood_control > 0) & (
        state.reservoir_month_flood_control_start > 0)
    # modify release in order to maintain a certain storage level
    # month_condition: True when the flood-control window does not wrap
    # around the end of the year.
    month_condition = state.reservoir_month_flood_control_start <= state.reservoir_month_flood_control_end
    total_condition = flood_control_condition & (
        (month_condition &
         (month >= state.reservoir_month_flood_control_start) &
         (month < state.reservoir_month_flood_control_end)) |
        (np.logical_not(month_condition) &
         (month >= state.reservoir_month_flood_control_start) |
         (month < state.reservoir_month_flood_control_end)))
    drop = 0 * state.reservoir_month_flood_control_start
    n_month = 0 * drop
    # Accumulate, over the flood-control months, how far below the annual
    # mean each month's scheduled flow falls (drop) and how many months
    # contribute (n_month).
    for m in np.arange(1, 13):
        # TODO assumes monthly
        m_and_condition = (m >= state.reservoir_month_flood_control_start) & (
            m < state.reservoir_month_flood_control_end)
        m_or_condition = (m >= state.reservoir_month_flood_control_start) | (
            m < state.reservoir_month_flood_control_end)
        drop = np.where(
            (month_condition & m_and_condition) |
            (np.logical_not(month_condition) & m_or_condition),
            np.where(
                grid.reservoir_streamflow_schedule.sel({
                    streamflow_time_name: m
                }).values >= grid.reservoir_streamflow_schedule.mean(
                    dim=streamflow_time_name).values,
                drop + 0,
                drop + np.abs(
                    grid.reservoir_streamflow_schedule.mean(
                        dim=streamflow_time_name).values -
                    grid.reservoir_streamflow_schedule.sel({
                        streamflow_time_name: m
                    }).values)),
            drop)
        n_month = np.where((month_condition & m_and_condition) |
                           (np.logical_not(month_condition) & m_or_condition),
                           n_month + 1,
                           n_month)
    state.reservoir_release = np.where(
        total_condition & (n_month > 0),
        state.reservoir_release + drop / n_month,
        state.reservoir_release)
    # now need to make sure it will fill up but issue with spilling in certain hydro-climate conditions
    month_condition = state.reservoir_month_flood_control_end <= state.reservoir_month_start_operations
    first_condition = flood_control_condition & month_condition & (
        (month >= state.reservoir_month_flood_control_end) &
        (month < state.reservoir_month_start_operations))
    second_condition = flood_control_condition & np.logical_not(
        month_condition) & (
            (month >= state.reservoir_month_flood_control_end) |
            (month < state.reservoir_month_start_operations))
    # TODO this logic exists in fortran mosart but isn't used...
    # fill = 0 * drop
    # n_month = 0 * drop
    # for m in np.arange(1,13): # TODO assumes monthly
    #     m_condition = (m >= self.state.reservoir_month_flood_control_end.values) &
    #     (self.reservoir_streamflow_schedule.sel({streamflow_time_name: m}).values > self.reservoir_streamflow_schedule.mean(dim=streamflow_time_name).values) & (
    #         (first_condition & (m <= self.state.reservoir_month_start_operations)) |
    #         (second_condition & (m <= 12))
    #     )
    #     fill = np.where(
    #         m_condition,
    #         fill + np.abs(self.reservoir_streamflow_schedule.mean(dim=streamflow_time_name).values - self.reservoir_streamflow_schedule.sel({streamflow_time_name: m}).values),
    #         fill
    #     )
    #     n_month = np.where(
    #         m_condition,
    #         n_month + 1,
    #         n_month
    #     )
    # Cap the release at the mean scheduled flow during the refill window.
    state.reservoir_release = np.where(
        (state.reservoir_release > grid.reservoir_streamflow_schedule.mean(
            dim=streamflow_time_name).values) &
        (first_condition | second_condition),
        grid.reservoir_streamflow_schedule.mean(
            dim=streamflow_time_name).values,
        state.reservoir_release)
def get_epiweeks(date):
    """Format *date* (anything pandas can parse) as a 'YYYY_EWww' label
    using CDC epiweek numbering."""
    cdc_week = str(Week.fromdate(pd.to_datetime(date), system="cdc"))  # get epiweeks
    return '{}_EW{}'.format(cdc_week[:4], cdc_week[-2:])
import pandas as pd
from epiweeks import Week, Year

# Load JHU truth files for incident deaths and cases.
df1 = pd.read_csv('data-truth/JHU/truth_JHU-Incident Deaths.csv')
df1.rename(columns={'value': 'inc_death'}, inplace=True)
df2 = pd.read_csv('data-truth/JHU/truth_JHU-Incident Cases.csv')
df2.rename(columns={'value': 'inc_case'}, inplace=True)

# merge cases and deaths into one dataframe
df = df1.merge(df2, on=['date', 'location', 'location_name'])

# add epi weeks for aggregation
df.date = pd.to_datetime(df.date)
df['epi_week'] = df.date.apply(lambda x: Week.fromdate(x).week)
df['epi_year'] = df.date.apply(lambda x: Week.fromdate(x).year)

# aggregate to weekly incidence
df = df.groupby(['location', 'location_name', 'epi_year', 'epi_week']).aggregate(
    {'date': max, 'inc_death': sum, 'inc_case': sum}).reset_index()

# only keep Saturdays (the max date within a complete epiweek)
df = df[df.date.dt.day_name() == 'Saturday']

# reformat
df = df[['date', 'location', 'location_name', 'inc_case', 'inc_death']].sort_values(['date', 'location'])

# export
df.to_csv('viz/truth_to_plot.csv', index=False)
def fromDateTime2EW(dt):
    """Return the epiweek of datetime *dt* as a zero-padded 'YYYYWW' string."""
    # BUG FIX: Week.fromdate takes a single date object, not
    # (year, month, day) positional arguments; datetime is a date
    # subclass so dt can be passed directly.
    w = Week.fromdate(dt)
    # BUG FIX: "{:4}{:2}" space-pads single-digit weeks (e.g. "2020 5");
    # zero-pad so the string is a sortable YYYYWW key.
    return "{:04d}{:02d}".format(w.year, w.week)
def parse_background_metadata(query_dict, label_fields, tree_fields, table_fields, background_metadata, present_in_tree, closest_sequences, node_summary_option, tip_to_tree, database_name_column, database_sample_date_column, protected_sequences, context_table_summary_field, date_fields, virus):
    """Merge background-database metadata into the query taxa.

    Creates new taxon objects for background sequences that appear in a
    tree or are a closest sequence, and back-fills missing fields on
    existing query taxa. Returns
    (full_tax_dict, adm2_present_in_background, old_data).

    NOTE(review): adm2_present_in_background is only assigned inside the
    row loop — an empty metadata file would leave it unbound; confirm the
    file always has rows.
    """
    full_tax_dict = query_dict.copy()
    # First pass just captures the column names.
    with open(background_metadata, 'r') as f:
        reader = csv.DictReader(f)
        col_name_prep = next(reader)
        col_names = list(col_name_prep.keys())
    old_data = False
    with open(background_metadata, 'r') as f:
        in_data = csv.DictReader(f)
        for sequence in in_data:
            seq_name = sequence[database_name_column]
            date = sequence[database_sample_date_column]
            country = sequence["country"]
            if "adm2_raw" not in col_names: ##for civet
                old_data = True
            if "adm2" in col_names:
                adm2 = sequence['adm2']
                # Normalise multi-valued adm2 entries to a sorted pipe-join.
                if "|" in adm2:
                    adm2 = "|".join(sorted(adm2.split("|")))
                if "location" in col_names:
                    location_label = sequence["location"]
                else:
                    location_label = adm2
                adm2_present_in_background = True
            else:
                adm2 = ""
                location_label = ""
                adm2_present_in_background = False
            # if virus == "sars-cov-2":
            #     uk_lineage = sequence["uk_lineage"]
            #     global_lineage = sequence["lineage"]
            #     phylotype = sequence["phylotype"]
            if node_summary_option == "adm2":
                if country != "UK":
                    node_summary_trait = country
                else:
                    node_summary_trait = sequence["adm2"]
            else:
                node_summary_trait = sequence[node_summary_option]
            # Background sequences that appear in a tree (or are a closest
            # sequence) but aren't already queries get their own taxon.
            if (seq_name in present_in_tree or seq_name in closest_sequences) and seq_name not in query_dict.keys():
                # if virus == "sars-cov-2":
                #     new_taxon = taxon(seq_name, country, label_fields, tree_fields, table_fields, global_lineage=global_lineage, uk_lineage=uk_lineage, phylotype=phylotype)
                # else:
                new_taxon = taxon(seq_name, country, label_fields, tree_fields, table_fields)
                if date == "":
                    date = "NA"
                new_taxon.sample_date = date
                new_taxon.node_summary = node_summary_trait
                new_taxon.epiweek = Week.fromdate(convert_date(date))
                if new_taxon.name in protected_sequences:
                    new_taxon.protected = True
                if seq_name in tip_to_tree.keys():
                    new_taxon.tree = tip_to_tree[seq_name]
                new_taxon.attribute_dict["adm2"] = adm2
                new_taxon.attribute_dict["location_label"] = location_label
                new_taxon.input_display_name = seq_name
                for field in label_fields:
                    if field in col_names:
                        if sequence[field] != "NA" and sequence[field] != "": #this means it's not in the input file
                            new_taxon.attribute_dict[field] = sequence[field]
                if context_table_summary_field and context_table_summary_field in col_names:
                    if sequence[context_table_summary_field] != "":
                        new_taxon.attribute_dict["context_table_summary_field"] = sequence[context_table_summary_field]
                for field in table_fields:
                    if field in col_names:
                        if sequence[field] != "NA" and sequence[field] != "":
                            new_taxon.table_dict[field] = sequence[field]
                full_tax_dict[seq_name] = new_taxon
            #There may be sequences not in COG tree but that are in the full metadata, so we want to pull out the additional information if it's not in the input csv
            if seq_name in query_dict.keys():
                tax_object = query_dict[seq_name]
                if tax_object.sample_date == "NA" and date != "" and date != "NA":
                    tax_object.sample_date = date
                    converted = convert_date(date)
                    tax_object.all_dates.append(converted)
                    tax_object.epiweek = Week.fromdate(converted)
                if "adm2" not in tax_object.attribute_dict.keys() and adm2 != "":
                    tax_object.attribute_dict["adm2"] = adm2
                if "location_label" not in tax_object.attribute_dict.keys() and location_label != "":
                    tax_object.attribute_dict["location_label"] = location_label
                if context_table_summary_field and context_table_summary_field in col_names:
                    if sequence[context_table_summary_field] != "" and tax_object.attribute_dict["context_table_summary_field"] == "NA":
                        tax_object.attribute_dict["context_table_summary_field"] = sequence[context_table_summary_field]
                for field in date_fields:
                    if field in reader.fieldnames:
                        if sequence[field] != "" and sequence[field] != "NA" and field not in tax_object.date_dict.keys():
                            date_dt = convert_date(sequence[field])
                            tax_object.date_dict[field] = date_dt
                for field in tree_fields:
                    if field in col_names:
                        if tax_object.attribute_dict[field] == "NA" and sequence[field] != "NA" and sequence[field] != "": #this means it's not in the input file
                            if field != "adm1":
                                tax_object.attribute_dict[field] = sequence[field]
                            else:
                                # adm1 gets special normalisation for UK rows.
                                if country == "UK":
                                    adm1 = UK_adm1(tax_object.name, sequence[field])
                                else:
                                    adm1 = "Other"
                                tax_object.attribute_dict[field] = adm1
                for field in label_fields:
                    if field in col_names:
                        if tax_object.attribute_dict[field] == "NA" and sequence[field] != "NA" and sequence[field] != "": #this means it's not in the input file
                            tax_object.attribute_dict[field] = sequence[field]
                for field in table_fields:
                    if field in col_names:
                        if tax_object.table_dict[field] == "NA" and sequence[field] != "NA" and sequence[field] != "": #this means it's not in the input file
                            tax_object.table_dict[field] = sequence[field]
                # if virus == "sars-cov-2":
                #     tax_object.global_lineage = global_lineage
                #     tax_object.uk_lineage = uk_lineage
                #     tax_object.phylotype = phylotype
                full_tax_dict[seq_name] = tax_object
    return full_tax_dict, adm2_present_in_background, old_data
def parse_input_csv(input_csv, query_id_dict, input_column, display_name, sample_date_column, tree_fields, label_fields, table_fields, context_table_summary_field, date_fields=None, UK_adm2_dict=None, patient_id_col=None, reinfection=False):
    """Enrich known query taxa with metadata from the user's input CSV.

    Rows whose *input_column* value matches a taxon in query_id_dict get
    their display name, dates, epiweek and attribute/table fields filled
    in. Returns (new_query_dict, full_query_count) where full_query_count
    is the total number of rows read.
    """
    full_query_count = 0
    new_query_dict = {}
    # First pass only reads the column names.
    with open(input_csv, 'r') as f:
        reader = csv.DictReader(f)
        col_name_prep = next(reader)
        col_names = list(col_name_prep.keys())
    with open(input_csv, 'r') as f:
        in_data = csv.DictReader(f)
        #in_data = [r for r in reader]
        for sequence in in_data:
            full_query_count += 1
            name = sequence[input_column]
            if name in query_id_dict.keys():
                taxon = query_id_dict[name]
                # NOTE(review): this patient assignment is repeated verbatim
                # a few lines below — likely an accidental duplicate.
                if reinfection:
                    taxon.attribute_dict["patient"] = sequence[patient_id_col]
                taxon.input_display_name = sequence[display_name]
                if reinfection:
                    taxon.attribute_dict["patient"] = sequence[patient_id_col]
                for field in date_fields:
                    if field in reader.fieldnames:
                        if sequence[field] != "" and sequence[field] != "NA":
                            date_dt = convert_date(sequence[field])
                            taxon.date_dict[field] = date_dt
                if sample_date_column in col_names: #if it's not in the background database or there is no date in the background database but date is provided in the input query
                    if sequence[sample_date_column] != "":
                        taxon.sample_date = sequence[sample_date_column]
                        taxon.epiweek = Week.fromdate(convert_date(sequence[sample_date_column]))
                if context_table_summary_field and context_table_summary_field in col_names:
                    # NOTE(review): the literal key "context_table_summary_field"
                    # is looked up here instead of the variable's value —
                    # confirm whether this should be sequence[context_table_summary_field].
                    if sequence["context_table_summary_field"] != "":
                        taxon.attribute_dict["context_table_summary_field"] = sequence[context_table_summary_field]
                for col in col_names: #Add other metadata fields provided
                    if col in table_fields:
                        if sequence[col] != "":
                            taxon.table_dict[col] = sequence[col]
                    if col in label_fields:
                        if sequence[col] != "":
                            taxon.attribute_dict[col] = sequence[col]
                    if col in tree_fields and col != input_column and col != "adm1":
                        if sequence[col] != "":
                            taxon.attribute_dict[col] = sequence[col]
                    # UK-specific normalisation of adm1/adm2 fields.
                    if taxon.country == "UK":
                        if col == "adm1":
                            adm1 = UK_adm1(name, sequence[col])
                            taxon.attribute_dict["adm1"] = adm1
                        if col == "adm2":
                            adm2 = sequence["adm2"]
                            if "|" in adm2:
                                adm2 = "|".join(sorted(adm2.split("|")))
                            taxon.attribute_dict["adm2"] = adm2
                            if "location" in col_names:
                                location_label = sequence["location"]
                            else:
                                location_label = adm2
                            taxon.attribute_dict["location_label"] = location_label
                        if "adm1" not in col_names and "adm1" in tree_fields:
                            # Fall back to mapping adm2 -> adm1 when adm1 absent.
                            if sequence[col] in UK_adm2_dict.keys():
                                adm1 = UK_adm2_dict[sequence[col]]
                                taxon.attribute_dict["adm1"] = adm1
                new_query_dict[taxon.name] = taxon
    return new_query_dict, full_query_count
# drop columns where 'country' and 'country_exposure' disagree
dfN['same_country'] = np.where(dfN['country'] == dfN['country_exposure'], 'yes', 'no')  # compare values
# An empty exposure country counts as agreement.
dfN.loc[dfN['country_exposure'] == '', 'same_country'] = 'yes'
dfN = dfN[dfN['same_country'].apply(
    lambda x: 'yes' in x)]  # exclude rows with conflicting place of origin
# print(dfN[['same_country', 'country', 'country_exposure']])
# BUG FIX: DataFrame.drop is not in-place by default — the original call
# discarded its result, leaving the helper column in dfN.
dfN = dfN.drop(columns=['same_country'])
# print(dfN[['strain', 'date']].iloc[[0, -1]])

# get epiweek end date, create column
dfN['date'] = pd.to_datetime(dfN['date'], errors='coerce')
dfN['epiweek'] = dfN['date'].apply(
    lambda x: Week.fromdate(x, system="cdc").enddate())

## SAMPLE FOCAL AND CONTEXTUAL SEQUENCES
purposes = ['focus', 'context']
subsamplers = []  # list of focal and contextual categories
for category in purposes:
    # Build level -> [names] lookups for each sampling purpose.
    query = {}
    for idx, val in dfS.loc[dfS['purpose'] == category, 'name'].to_dict().items():
        key = dfS.iloc[idx]['level']
        if key not in query.keys():
            query[key] = [val]
        else:
            query[key].append(val)
    # print(query)
    subsamplers.append(query)
forecast_start = sys.argv[2] samples_directory = sys.argv[3] import numpy as np from epiweeks import Week, Year num_weeks = 8 data = util.load_state_data() places = sorted(list(data.keys())) #places = ['AK', 'AL'] allQuantiles = [0.01, 0.025] + list(np.arange(0.05, 0.95 + 0.05, 0.05)) + [0.975, 0.99] forecast_date = pd.to_datetime(forecast_start) currentEpiWeek = Week.fromdate(forecast_date) forecast = { 'quantile': [], 'target_end_date': [], 'value': [], 'type': [], 'location': [], 'target': [] } for place in places: prior_samples, mcmc_samples, post_pred_samples, forecast_samples = util.load_samples( place, path=samples_directory) forecast_samples = forecast_samples['mean_z_future'] t = pd.date_range(start=forecast_start,
def get_week_from_date(self, year, date):
    """Return the epiweek number of *date* after applying the year's
    leap offset (self.get_leap(year).days)."""
    offset_days = self.get_leap(year).days
    return Week.fromdate(date + timedelta(days=offset_days)).week
def plotTop40(config, catalog):
    """Group playlist plays by epiweek and plot per-artist play summaries.

    *catalog* is an iterable of play records with 'track', 'artist' and
    'airdate' entries. Saves 'playsPerWeek.png' and shows the figure;
    returns None. The 'config' argument is currently unused (see the
    commented-out pivot override).
    """
    from datetime import datetime
    from epiweeks import Week
    pivot = 'artist'
    # we rank artists by most plays, take the top 25 and
    # add their played tracks to the playlist.
    #pivot = config['pivot'] if 'pivot' in config else 'artist'
    result = {}
    print(f'Grouping by {pivot}')
    for r in catalog:
        if r['track'] is None:
            continue
        artistid = r[pivot]['name']
        if pivot == 'track':
            # Disambiguate identically-named tracks by different artists.
            artistid += ';;'+r['artist']['name']
        week = Week.fromdate(datetime.strptime(r['airdate'], '%Y-%m-%dT%H:%M:%SZ'))
        if week not in result:
            result[week] = {}
        if artistid not in result[week]:
            result[week][artistid] = {'track': r, 'plays': set(), 'songs': set()}
        result[week][artistid]['plays'].add(r['airdate'])  # count by unique timestamps. Sometimes the playlist has duplicates.
        result[week][artistid]['songs'].add((r['artist']['name'], r['track']['name']))
    print([w for w in result])
    all_results = result
    # Disabled branch: bump-chart of weekly top-N ranks.
    if 0:
        all_artists = []
        plots = {}
        N = 10
        W = 20
        weeks = list(sorted(all_results))[-W:]
        for w in weeks:
            result = all_results[w]
            topN = list(sorted(result, key=lambda x: len(result[x]['plays']), reverse=True))[:N]
            print(topN)
            for i, a in enumerate(topN):
                if a not in all_artists:
                    all_artists.append(a)
                if a not in plots:
                    plots[a] = []
                plots[a].append((w, i))
            #ranks.append(topN)
        y0_values = dict((k, N-v+1) for v, k in enumerate(all_artists))
        x_values = dict([(k, v+1) for (v, k) in enumerate(weeks)])
        import matplotlib.pyplot as plt
        from math import isnan
        fig, ax = plt.subplots(figsize=(12, 8))  #subplot_kw=dict(axisbg='#EEEEEE'))
        for a in plots:
            lookup = dict(plots[a])
            X = [x_values[w] for w in weeks]  #[x_values[w] for (w,_) in plots[a]]
            Y = [N-lookup[w] if w in lookup else float('NaN') for w in weeks]  #N-v for (_,v) in plots[a]]
            ax.plot(X, Y, 'o-')
            for i in range(len(X)):
                if i == 0 or (not isnan(Y[i]) and isnan(Y[i-1])):
                    ax.text(X[i], Y[i]+0.15, a, ha='center', fontsize=8)
        ax.set_yticks(range(N+1))
        ax.set_yticklabels(['']+[str(N-y) for y in range(N)])
        plt.show()
    # Compare average plays-per-week before and after the threshold week.
    threshold = Week.fromdate(datetime(2020, 5, 26))
    summary = {}
    from collections import Counter
    denom = Counter()
    for w in all_results:
        isprior = w < threshold
        denom[isprior] += 1
        current = all_results[w]
        for a in current:
            if a not in summary:
                summary[a] = Counter()
            summary[a][isprior] += len(current[a]['plays'])
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(figsize=(12, 8))
    labels = []
    N = 20
    for i, a in enumerate(list(sorted(summary, key=lambda x: summary[x][True]+summary[x][False]))[-N:]):
        ax.plot([2*i-0.5, 2*i+0.5], [summary[a][True]/denom[True], summary[a][False]/denom[False]], 'o-')
        labels.append(a)
        print('{}\t{}\t{}'.format(a, summary[a][True]/denom[True], summary[a][False]/denom[False]))
    ax.set_xticks([2*i for i in range(N)])
    ax.set_xticklabels(labels, rotation=90)
    ax.set_ylabel('Plays per Week')
    plt.tight_layout()
    plt.savefig('playsPerWeek.png')
    plt.show()
    # Disabled branch: mpld3 scatter of track vs artist plays.
    if 0:
        import mpld3
        from math import log, ceil
        # NOTE(review): `ctr` is not defined anywhere in this function —
        # this dead branch would raise NameError if ever enabled.
        data = [(x, y, ctr[(x, y)]) for (x, y) in ctr]
        x_track = [d[0] for d in data]
        y_artist = [d[1] for d in data]
        count = [len(d[2]) for d in data]
        labels = [','.join(d[2]) for d in data]
        fig, ax = plt.subplots(figsize=(4, 2))  #subplot_kw=dict(axisbg='#EEEEEE'))
        scatter = ax.scatter(x_track, y_artist, s=[3*log(c+1) for c in count])
        xint = range(min(x_track), ceil(max(x_track))+1)
        #ax.set_xticks(xint)
        ax.set_xlabel('Track Plays')
        ax.set_ylabel('Artist Plays')
        tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
        mpld3.plugins.connect(fig, tooltip)
        mpld3.show()